# Exploratory Analysis of IP Addresses Dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Load the labelled flow records from the Kaggle input directory.
# NOTE(review): no index_col is given, so parse_dates=True presumably has
# nothing to parse and columns such as Timestamp stay plain strings - confirm
# before relying on datetime behaviour.
data = pd.read_csv(
    "../input/ip-network-traffic-flows-labeled-with-87-apps/Dataset-Unicauca-Version2-87Atts.csv",
    parse_dates=True,
)
data.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

In [None]:
# Names of all columns in the raw dataset.
data.columns

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
data.info()

## Let's take a look at all the non-numeric columns

In [None]:
# Columns stored with the generic object dtype ('O') are treated as non-numeric.
non_num_cols = [name for name, dtype in data.dtypes.items() if dtype == 'O']
# Keep a frame of just those columns for the categorical summaries.
non_num_data = data[non_num_cols]
non_num_data

### Number of unique values and their frequencies in the non-numeric columns

In [None]:
# (column name, number of distinct values) for every non-numeric column.
list(non_num_data[non_num_cols].nunique().items())

In [None]:
def summarize_cat(col_name, frame=None):
    """Print the percentage share of each frequent value in a column.

    Values are listed from most to least frequent. Every value that accounts
    for at least 1% of the rows gets its own line. The first value below 1%
    ends the listing with a single ``Others`` line covering the share that
    has not been printed yet. No ``Others`` line is printed when every value
    reaches 1%.

    Percentages are relative to the total number of rows in the frame.

    Args:
        col_name: Name of the column to summarize.
        frame: DataFrame holding the column. Defaults to the notebook-level
            ``non_num_data`` frame.
    """
    if frame is None:
        frame = non_num_data
    total_rows = len(frame)
    remaining_per = 100
    # value_counts() already yields values in descending order of frequency,
    # so no extra sort is needed. .items() replaces Series.iteritems(), which
    # was removed in pandas 2.0.
    for value, count in frame[col_name].value_counts().items():
        per = count / total_rows * 100
        if per < 1:
            print(f'Others : {remaining_per:.2f}%')
            break
        print(f'{value} : {per:.2f}%')
        remaining_per -= per

In [None]:
# Print a frequency summary for each non-numeric column, separated by blank lines.
for column in non_num_cols:
    print(f"Summary of {column} column : ")
    summarize_cat(column)
    print('\n')

## Exploratory Analysis for numeric columns

In [None]:
# Numeric columns are everything that is not in non_num_cols. Filter the
# column index instead of using a set difference so the original column order
# is kept and the result is identical on every run (string set ordering
# varies between interpreter sessions).
_non_num_lookup = frozenset(non_num_cols)
num_cols = [col for col in data.columns if col not in _non_num_lookup]
num_cols

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data[num_cols].describe()

In [None]:
# Numeric columns that contain at least one missing value.
[name for name, has_null in data[num_cols].isnull().any().items() if has_null]

In [None]:
print("range and no. of unique values in numeric columns")
for col in num_cols:
    series = data[col]
    # Use the vectorized Series reductions rather than the built-in max/min,
    # which would walk every row in Python and give order-dependent results
    # when NaN is present.
    value_range = series.max() - series.min()
    print(f'{col}\tRange : {value_range}, No. of unique values : {series.nunique()}')

## For columns with at most 50 unique values we plot histograms; for the others we just list the distribution of the most frequent values, as with the categorical columns

In [None]:
# Low-cardinality numeric columns (at most 50 distinct values) get histograms.
cols_for_hist = [
    name
    for name, n_unique in data[num_cols].nunique().items()
    if n_unique <= 50
]
cols_for_hist, len(cols_for_hist)

In [None]:
# High-cardinality numeric columns (more than 50 distinct values).
cols_for_desc = [
    name
    for name, n_unique in data[num_cols].nunique().items()
    if n_unique > 50
]
cols_for_desc

In [None]:
# One histogram per low-cardinality column on a 7 x 3 grid, which has room
# for at most 21 columns.
data[cols_for_hist].hist(figsize=(12, 20), layout=(7, 3))
# Adjust subplot spacing so the panels and their titles do not overlap.
plt.tight_layout()

## Correlation Matrix

In [None]:
# Pairwise correlation between the numeric columns (pandas defaults to Pearson).
corr = data[num_cols].corr()

In [None]:
# Draw the correlation matrix as a heat map on one large figure.
fig = plt.figure(figsize=(25, 25))
plt.matshow(corr, fignum=fig.number)
plt.title('Correlation Matrix of Numeric columns in the dataset', fontsize=20)

# One tick per numeric column, labelled with the column name on both axes.
tick_positions = range(len(num_cols))
plt.xticks(tick_positions, num_cols, fontsize=14, rotation=90)
plt.yticks(tick_positions, num_cols, fontsize=14)
# Show the x-axis ticks along the bottom edge of the matrix.
plt.gca().xaxis.set_ticks_position('bottom')

# Slim colour bar sized to sit beside the matrix.
colorbar = plt.colorbar(fraction=0.0466, pad=0.02)
colorbar.ax.tick_params(labelsize=10)
plt.show()

## Data Cleansing and Preprocessing

In [None]:
# Work on a copy so the in-place cleaning steps below leave `data` untouched.
ipdata = data.copy()

### Remove the Timestamp and Flow.ID columns

In [None]:
# Report how many distinct values each identifier-like column holds.
for label, column in (("Timestamp", "Timestamp"), ("FlowID", "Flow.ID")):
    print(f"No. of unique values in {label} column :", ipdata[column].nunique())

In [None]:
# Remove both columns from ipdata in place.
ipdata.drop(columns=['Timestamp', 'Flow.ID'], inplace=True)

### Drop all columns with only a single unique value

In [None]:
# Columns holding exactly one distinct value.
single_unique_cols = [
    name for name, n_unique in ipdata.nunique().items() if n_unique == 1
]
single_unique_cols

In [None]:
# Remove the single-valued columns from ipdata in place.
ipdata.drop(columns=single_unique_cols, inplace=True)

### Convert IP address and port information
These could be converted to countries, but I couldn't find an API that does this for a large number of entries, so the columns are dropped for now.

In [None]:
# Source/destination address and port columns; display them for inspection.
ip_add_cols = ['Source.IP', 'Source.Port', 'Destination.IP', 'Destination.Port']
ipdata[ip_add_cols]

In [None]:
# Remove the address and port columns from ipdata in place.
ipdata.drop(columns=ip_add_cols, inplace=True)

### Label Encode ProtocolName column

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit and encode in a single pass (the original fitted the encoder twice).
# The fitted encoder is kept so encoder.classes_ / encoder.inverse_transform
# can map the integer codes back to protocol names later.
encoder = LabelEncoder()
ipdata['ProtocolName'] = encoder.fit_transform(ipdata['ProtocolName'])
ipdata['ProtocolName']

In [None]:
# Preview the first 10 rows of the cleaned frame.
ipdata.head(10)

In [None]:
# Dimensions of the cleaned frame as (rows, columns).
ipdata.shape

## The data is now ready for predictive analysis