## Import Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## Import DataSet

In [None]:
# Load the training CSV into a DataFrame; every later cell works on `df`.
df = pd.read_csv('../input/domain-name-security/train.csv')

In [None]:
# Render the raw training frame as the cell output for a first look.
df

In [None]:
# Summary statistics of the training frame's columns.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

### Total percentage of missing data

In [None]:
# Count the NaN cells in each column; the grand total is expressed below as a
# percentage of all cells in the frame.
missing_values_count = df.isnull().sum()

# df.size is rows * columns. It replaces np.product(df.shape): np.product was
# deprecated in NumPy 1.25 and removed in NumPy 2.0.
total_cells = df.size

total_missing = missing_values_count.sum()

# An empty frame has zero cells; report 0% instead of dividing by zero.
percentage_missing = (total_missing / total_cells) * 100 if total_cells else 0.0
print(percentage_missing)

In [None]:
# Print the distinct values found in each column, one array per column.
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(distinct_values)

## Label encoding of categorical features

In [None]:
from sklearn import preprocessing

In [None]:
# Encode the 'Action' labels as integer class ids in place.
# The fitted encoder stays available as `le`; `le.classes_` holds the original
# labels in encoded order.
le = preprocessing.LabelEncoder()
df['Action'] = le.fit_transform(df['Action'])
df

### Data Cleaning

In [None]:
# Table of per-column missing-value share, expressed as a percentage.
nan_share = df.isnull().mean() * 100
NAN = pd.DataFrame({
    'column_name': nan_share.index,
    'percentage': nan_share.values,
})
NAN

In [None]:
# Absolute number of missing values in each column.
null_counts = df.isnull().sum()
null_counts

### Replace all NaN values with the most frequent elements

In [None]:
# Columns whose gaps are filled with the column's most frequent value.
columns_None = [
    'Source Port',
    'Destination Port',
    'Elapsed Time (sec)',
    'NAT Source Port',
    'Bytes Received',
    'Bytes Sent',
    'NAT Destination Port',
    'Bytes',
]
# First row of mode() is the most frequent value per column; fillna aligns it
# to the selected columns by name.
most_frequent = df.mode().iloc[0]
df[columns_None] = df[columns_None].fillna(most_frequent)

In [None]:
# Inspect the frame after the mode-based imputation.
df

### Replace NaN values with the preceding values

In [None]:
# Forward-fill the remaining gaps with the value from the previous row.
# DataFrame.ffill replaces fillna(method='ffill'), which is deprecated since
# pandas 2.1.
# NOTE: a NaN in the very first row has no predecessor and stays NaN.
df.ffill(inplace=True)

In [None]:
# Inspect the frame after forward-filling.
df

In [None]:
# Re-count missing values per column to check the result of the imputation.
null_count = df.isnull().sum()
null_count

## Show a few features with their Mutual Information scores

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Separate the target ('Action') from the features without touching `df`.
y = df['Action'].copy()
x = df.drop(columns='Action')

In [None]:
# Replace every object-typed feature column with integer codes.
for colname in x.select_dtypes('object').columns:
    codes, _ = pd.factorize(x[colname])
    x[colname] = codes

# Boolean mask marking which feature columns have an integer dtype.
discrete_features = x.dtypes == int

## Data Visualization With most Dependent Features

### Multivariate Analysis

In [None]:
# Histogram of 'Source Port' with a kernel-density estimate overlaid.
sns.histplot(data=df, x="Source Port", kde = True)

In [None]:
# Scatter of 'Bytes' against 'pkts_sent', coloured by the encoded 'Action' class.
sns.relplot(x='Bytes', y='pkts_sent',hue='Action', data=df)

In [None]:
def draw_histograms(dataframe, features, rows, cols):
    """Draw one 20-bin histogram per feature on a ``rows`` x ``cols`` grid.

    Args:
        dataframe: Frame holding the columns to plot.
        features: Iterable of column names; each gets its own subplot, filled
            in order starting at grid position 1.
        rows: Number of subplot rows in the figure.
        cols: Number of subplot columns in the figure.
    """
    figure = plt.figure(figsize=(20, 20))
    for position, feature in enumerate(features, start=1):
        axis = figure.add_subplot(rows, cols, position)
        dataframe[feature].hist(bins=20, ax=axis, facecolor='green')
        axis.set_title(feature + " Distribution", color='Red')

    figure.tight_layout()
    plt.show()


# Plot every column of the training frame on a 4 x 4 grid.
draw_histograms(df, df.columns, 4, 4)

## Correlation Matrix

In [None]:
# Pairwise correlation of the numeric columns, rendered as a heat-map table.
# Selecting numeric columns first keeps this working on pandas >= 2.0, where
# DataFrame.corr raises on non-numeric columns instead of dropping them.
# The unused random 10x10 frame that used to be built here was removed.
corr = df.select_dtypes(include='number').corr()
corr.style.background_gradient(cmap='coolwarm')

### Train Test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# (rows, columns) of the held-out features.
X_test.shape

In [None]:
# Number of held-out labels; should match the row count of X_test.
Y_test.shape

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
def print_score(clf, X_train, Y_train, X_test, Y_test, train=True):
    """Print accuracy, classification report and confusion matrix for ``clf``.

    Args:
        clf: Fitted estimator exposing ``predict``.
        X_train: Training features.
        Y_train: Training labels.
        X_test: Held-out features.
        Y_test: Held-out labels.
        train: When truthy, report on the training split; otherwise report on
            the held-out split. Previously a falsy value other than ``False``
            (e.g. ``None``) printed nothing at all.
    """
    # Choose the split once so the reporting code is not duplicated.
    if train:
        title, features, target = "Train Result:", X_train, Y_train
    else:
        title, features, target = "Test Result:", X_test, Y_test

    pred = clf.predict(features)
    clf_report = pd.DataFrame(classification_report(target, pred, output_dict=True))
    divider = "_______________________________________________"

    print(f"{title}\n================================================")
    print(f"Accuracy Score: {accuracy_score(target, pred) * 100:.2f}%")
    print(divider)
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print(divider)
    print(f"Confusion Matrix: \n {confusion_matrix(target, pred)}\n")

## Apply XGBoost

In [None]:
import xgboost as xgb

# 'Action' is a multi-class target (the submission step decodes class ids
# 0-3), so request the multi-class objective explicitly rather than
# "binary:logistic".
# NOTE(review): the scikit-learn wrapper appears to switch to a multi-class
# objective on its own when it sees more than two classes, so predictions
# should be unchanged — confirm against the installed xgboost version.
xgb_model = xgb.XGBClassifier(objective="multi:softprob", random_state=42)
xgb_model.fit(X_train, Y_train)

# Predicted class ids for the held-out split.
Y_pred_xgb = xgb_model.predict(X_test)

In [None]:
# One prediction per held-out row is expected.
Y_pred_xgb.shape

In [None]:
# Held-out accuracy of XGBoost as a percentage rounded to two decimals.
# Arguments are passed in scikit-learn's (y_true, y_pred) order; accuracy is
# symmetric, so the value is unchanged.
score_xgb = round(accuracy_score(Y_test, Y_pred_xgb) * 100, 2)

print(f"The accuracy score achieved using XGBoost is: {score_xgb} %")

## K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 7-nearest-neighbours classifier on the training split (fit returns
# the estimator itself), then predict the held-out split.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, Y_train)
Y_pred_knn = knn.predict(X_test)

In [None]:
# One prediction per held-out row is expected.
Y_pred_knn.shape

In [None]:
# Held-out accuracy of KNN as a percentage rounded to two decimals.
# Arguments are passed in scikit-learn's (y_true, y_pred) order; accuracy is
# symmetric, so the value is unchanged.
score_knn = round(accuracy_score(Y_test, Y_pred_knn) * 100, 2)

print(f"The accuracy score achieved using KNN is: {score_knn} %")

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the training split (fit returns the estimator
# itself), then predict the held-out split.
nb = GaussianNB().fit(X_train, Y_train)
Y_pred_nb = nb.predict(X_test)

In [None]:
# One prediction per held-out row is expected.
Y_pred_nb.shape

In [None]:
# Held-out accuracy of Naive Bayes as a percentage rounded to two decimals.
# Arguments are passed in scikit-learn's (y_true, y_pred) order; accuracy is
# symmetric, so the value is unchanged.
score_nb = round(accuracy_score(Y_test, Y_pred_nb) * 100, 2)

print(f"The accuracy score achieved using Naive Bayes is: {score_nb} %")

## Final accuracy of all models: which model fits best

In [None]:
# Accuracy figures and their display names, kept in matching order.
scores = [score_nb, score_knn, score_xgb]
algorithms = ["Naive Bayes", "K-Nearest Neighbors", "XGBoost"]

# Print one summary line per model.
for algorithm, score in zip(algorithms, scores):
    print(f"The accuracy score achieved using {algorithm} is: {score} %")

In [None]:
# Bar chart comparing the held-out accuracy of the three models.
sns.set(rc={'figure.figsize': (7, 6)})

# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally and raises a TypeError.
ax = sns.barplot(x=algorithms, y=scores)

# Label the axes barplot drew on, after plotting, so the labels always land
# on the right axes.
ax.set_xlabel("Algorithms")
ax.set_ylabel("Accuracy score")
plt.show()

In [None]:
# Load the test CSV that the submission predictions are made for.
df1 = pd.read_csv('../input/domain-name-security/test.csv')

In [None]:
# Render the raw test frame as the cell output.
df1

In [None]:
# Table of per-column missing-value share in the test frame, as a percentage.
nan_share_test = df1.isnull().mean() * 100
NAN = pd.DataFrame({
    'column_name': nan_share_test.index,
    'percentage': nan_share_test.values,
})
NAN

In [None]:
# Columns whose gaps are filled with the test frame's own most frequent value.
columns_None = [
    'Source Port',
    'Destination Port',
    'Elapsed Time (sec)',
    'NAT Source Port',
    'Bytes Received',
    'Bytes Sent',
    'NAT Destination Port',
    'Bytes',
]
# First row of mode() is the most frequent value per column; fillna aligns it
# to the selected columns by name.
most_frequent_test = df1.mode().iloc[0]
df1[columns_None] = df1[columns_None].fillna(most_frequent_test)

In [None]:
# Forward-fill the remaining gaps with the value from the previous row.
# DataFrame.ffill replaces fillna(method='ffill'), which is deprecated since
# pandas 2.1.
df1.ffill(inplace=True)

In [None]:
# Back-fill gaps that still have no value, using the next row's value.
# DataFrame.bfill replaces fillna(method='backfill'), which is deprecated
# since pandas 2.1.
df1.bfill(inplace=True)

In [None]:
# First rows of the imputed test frame.
df1.head()

In [None]:
# (rows, columns) of the test frame.
df1.shape

In [None]:
# Predict class ids for the test frame with the fitted XGBoost model.
# Selecting X_train.columns gives the model exactly the training features, in
# training order; a missing feature raises a KeyError here.
# NOTE: this rebinds Y_pred_xgb, replacing the held-out predictions.
Y_pred_xgb = xgb_model.predict(df1[X_train.columns])

In [None]:
# Two-column submission frame: the test row ID and the predicted class id.
submission = df1[['ID']].assign(Action=Y_pred_xgb)

In [None]:
# Inspect the submission while 'Action' still holds integer class ids.
submission

In [None]:
# Turn the predicted class ids back into their original labels with the
# encoder fitted on the training 'Action' column. This replaces a hard-coded
# id -> label list that was only correct if the training data contained
# exactly the four listed labels in that order.
submission['Action'] = le.inverse_transform(submission['Action'])

In [None]:
# Inspect the submission after 'Action' was decoded to label strings.
submission

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('DomainNameSecuritySubmission.csv', index=False)

# Submission on Kaggle

# End Model Prediction