In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder


from sklearn.model_selection import train_test_split# to split dataset  

from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix
from sklearn.metrics import accuracy_score,precision_score, f1_score, recall_score

import time

import pickle

from warnings import filterwarnings
# Silence every warning issued through the `warnings` module for the rest of the session.
filterwarnings('ignore')


# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None

# **Import Data**

In [None]:
# Both CSV files are parsed with the same delimiter and code page.
csv_options = {'sep': ',', 'encoding': 'cp1252'}

# NOTE(review): `train` is read from the *testing-set* file and `test` from the
# *training-set* file. Kept exactly as written -- presumably intentional; confirm.
train = pd.read_csv('/kaggle/input/unsw-nb15/UNSW_NB15_testing-set.csv', **csv_options)
test = pd.read_csv('/kaggle/input/unsw-nb15/UNSW_NB15_training-set.csv', **csv_options)

> ## ***Concatenate datasets***

In [None]:
# Stack the two loaded frames into one so the notebook can draw its own split.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each frame
# keeps its own default 0-based row labels, so labels shared by both frames occur
# twice and index-aligned operations (e.g. Series.corr) become ambiguous.
df = pd.concat([train, test], ignore_index=True)

In [None]:
# Display (rows, columns) of the combined frame.
df.shape

> ##  ***Split dataset***

In [None]:
# Hold out 20% of the shuffled rows for testing. random_state pins the shuffle so
# a fresh "Restart & Run All" reproduces the same partition and the same scores.
train, test = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)

# Separate the target column from the features for each split.
# NOTE(review): only `label` is dropped. If the CSVs also contain columns derived
# from the target or pure row identifiers (the public UNSW-NB15 files are described
# as having `attack_cat` and `id`), they stay in the features and would leak the
# label -- TODO confirm the column list and drop them if present.
y_train = train.label
x_train_ = train.drop(['label'], axis=1)

y_test = test.label
x_test_ = test.drop(['label'], axis=1)

> ## **encoding**

In [None]:
# Replace each object-dtype column by the occurrence count of its value in the
# training split; values that never occur in the training split become NaN in x_test.
# encoding_frequency is defined in the "Some functions" cell at the end of the
# notebook, so that cell has to be executed before this one.
x_train, x_test = encoding_frequency(x_train_, x_test_)

In [None]:
# Show the NaN count per column: first row is x_train, second row is x_test.
check_na_values(x_train, x_test)

> ## **impute**

In [None]:
# Replace the remaining NaNs in the test features with the sentinel -1.
# The result is assigned instead of filling in place, so this cell does not
# modify any other name that may be bound to the same DataFrame.
x_test = x_test.fillna(-1)

> ## **Scaling**

In [None]:
# Learn the per-column min/max from the training features only, then apply that
# same scaling to both splits.
minmax = MinMaxScaler().fit(x_train)

X_train = minmax.transform(x_train)
X_test = minmax.transform(x_test)

# **LogisticRegression**

In [None]:
from sklearn.linear_model import LogisticRegression

> ## **prepare model 1**

In [None]:
# Logistic regression with library defaults; only the random seed is fixed.
lr1 = LogisticRegression(random_state=42)

>> ### **data for model 1**

In [None]:
# Order matters: evaluation() unpacks this list positionally as
# [train features, train labels, test features, test labels].
data1 = [X_train, y_train, X_test, y_test]

>>> #### **evaluation**

In [None]:
# Fit lr1, print the classification report, draw the confusion matrix and
# display the one-row metrics table returned by evaluation().
evaluation(lr1, data1)

---
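# **Some functions**

> **Note:** run the cell below before the cells above — `encoding_frequency`, `check_na_values` and `evaluation` are called earlier in the notebook but are only defined here.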

In [None]:
def encoding_frequency(train, test):
    """Frequency-encode every object-dtype column of ``train`` and ``test``.

    For each object-dtype column of ``train``, each value is replaced by the
    number of times it occurs in that column of ``train``. The same mapping is
    applied to ``test``, so a value that never occurs in ``train`` becomes NaN
    in the encoded ``test`` frame.

    The inputs are left untouched; encoded copies are returned.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame the frequencies are computed from.
    test : pandas.DataFrame
        Frame encoded with the frequencies learned from ``train``. It must
        contain every object-dtype column of ``train``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_encoded, test_encoded)``.
    """
    # Work on copies so the caller's frames are not silently overwritten.
    train_encoded = train.copy()
    test_encoded = test.copy()

    for feature in train.select_dtypes('object').columns:
        # Occurrence count of each value, learned from the training frame only.
        frequency_mapping = train[feature].value_counts().to_dict()

        train_encoded[feature] = train[feature].map(frequency_mapping)
        test_encoded[feature] = test[feature].map(frequency_mapping)

    return train_encoded, test_encoded


def check_na_values(X_train, X_test):
    """Return the per-column count of missing values for two frames.

    The result has two rows, both labelled 0: the first holds the NaN counts of
    ``X_train``, the second those of ``X_test``.

    Side effect: sets the pandas option ``display.max_columns`` to ``None``.
    """
    # One single-row frame of NaN counts per input, in (train, test) order.
    per_frame_counts = [
        pd.DataFrame(frame.isna().sum().to_dict(), index=[0])
        for frame in (X_train, X_test)
    ]
    na_values = pd.concat(per_frame_counts)

    # Lift the column cap so a wide result is displayed in full.
    pd.set_option('display.max_columns', None)
    return na_values


def select_with_corr(X_train, y_train, X_test, shold):
    """Keep the columns whose correlation with the target exceeds ``shold``.

    Every non-object column of ``X_train`` is correlated with ``y_train``
    (``Series.corr``); columns whose coefficient is strictly greater than
    ``shold`` are kept. The comparison is on the signed value, so negatively
    correlated columns are never selected for a non-negative threshold.

    Returns
    -------
    tuple of pandas.DataFrame
        ``X_train`` and ``X_test`` restricted to the selected columns.
    """
    numeric_columns = X_train.columns[X_train.dtypes != 'object']

    # Correlation of each candidate column with the target.
    correlations = {name: X_train[name].corr(y_train) for name in numeric_columns}

    kept = [name for name, value in correlations.items() if value > shold]

    return X_train[kept], X_test[kept]



def evaluation(model, data):
    """Fit ``model`` and report classification metrics on the held-out data.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``.
    data : sequence
        Four items, read positionally: training features, training labels,
        test features, test labels.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``train_score``, ``test_score``,
        ``Accuracy_score``, ``Precision_score``, ``Recall_score``, ``f1_score``
        and ``Time``. ``Time`` is the wall-clock time in seconds from just
        before ``fit`` until all metrics are computed, so it includes
        prediction, reporting and plotting.

    Side effects: prints the classification report and draws a confusion matrix.
    """
    import time

    X, Y, x, y = data[0], data[1], data[2], data[3]

    tmp = {}

    start = time.time()
    model.fit(X, Y)

    ypred = model.predict(x)

    # Compare against the labels passed in `data`, not the notebook-level `y_test`,
    # so the report is correct whatever test set the caller supplies.
    print(classification_report(y, ypred))

    # NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
    # removed in 1.2; on newer versions use
    # ConfusionMatrixDisplay.from_estimator(model, x, y) and update the import.
    plot_confusion_matrix(model, x, y)

    tmp['train_score'] = model.score(X, Y)
    tmp['test_score'] = model.score(x, y)
    tmp['Accuracy_score'] = accuracy_score(y, ypred)
    # The three scores below are called with their default settings.
    tmp['Precision_score'] = precision_score(y, ypred)
    tmp['Recall_score'] = recall_score(y, ypred)
    tmp['f1_score'] = f1_score(y, ypred)
    tmp['Time'] = time.time() - start

    results = pd.DataFrame(tmp, index=[0])

    return results