In [None]:
#importing the required Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xg
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import warnings
warnings.simplefilter(action='ignore', category=Warning)

In [None]:
# Data collection: read the training CSV from the Kaggle input directory.
TRAIN_CSV = "/kaggle/input/novartis-data/Train.csv"
train = pd.read_csv(TRAIN_CSV)

In [None]:
# Print the (rows, columns) shape, then show the first ten rows as the cell output.
print("shape of data set: ", train.shape)
train.head(10)

## Checking for Null Values

In [None]:
# Per-column missing-value summary: absolute count and share of all rows.
# The share is scaled to 0-100 so it matches its "percentage" label
# (it used to be a 0-1 fraction shown under a percentage heading).
nullcnt = train.isnull().sum()
NullValues = nullcnt / len(train) * 100
frame = {'Number of Null values': nullcnt, 'percentage of null values': NullValues}
pd.DataFrame(frame)

## Handling Data Types

In [None]:
# Show the dtype pandas inferred for every column.
train.dtypes

In [None]:
# Parse the DATE column (object dtype after read_csv) into datetime values,
# then print the resulting dtype and a ten-row sample as a sanity check.
train['DATE'] = pd.to_datetime(train['DATE'])
print("Dtype of date column: ", train['DATE'].dtypes)
print(train['DATE'].head(10))


In [None]:
def datetimesplit(df):
    """Expand the datetime ``DATE`` column into calendar features.

    The input frame is left untouched; a new frame is returned in which
    ``DATE`` is removed and these columns are appended, in this order:

    - ``date``: month number (1-12). The name is historical; it holds the
      month, not a date.
    - ``DAY``: day of the month.
    - ``YEAR``: calendar year.
    - ``DAYOFWEEK``: 0 for Monday through 6 for Sunday.
    - ``WEEKEND``: 1 for Saturday/Sunday, otherwise 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``DATE`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``DATE`` and with the columns listed above.

    Raises
    ------
    KeyError
        If ``df`` has no ``DATE`` column.
    """
    stamps = df['DATE']
    weekday = stamps.dt.dayofweek
    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    return df.assign(
        date=stamps.dt.month,
        DAY=stamps.dt.day,
        YEAR=stamps.dt.year,
        DAYOFWEEK=weekday,
        # dayofweek 5 and 6 are Saturday and Sunday.
        WEEKEND=np.where(weekday >= 5, 1, 0),
    ).drop(columns='DATE')

In [None]:
# Swap DATE for its derived calendar columns, then preview two rows.
# NOTE: not re-runnable on its own - after the first run `train` no longer
# has a DATE column, which datetimesplit reads.
train = datetimesplit(train)
train.head(2)

## Handling the null values

##### X_12 contains 0.7% null values. We impute them by treating X_12 as the dependent (target) column and the remaining columns as the independent (predictor) columns


In [None]:
# Partition rows by whether X_12 is present. X_12 is the imputation target;
# every other column except INCIDENT_ID (MULTIPLE_OFFENSE included) is a predictor.
x12_known = train['X_12'].notna()
non_predictors = ['X_12', 'INCIDENT_ID']
train_miss = train.loc[x12_known].drop(columns=non_predictors)
train_y = train.loc[x12_known, 'X_12']
pred_miss = train.loc[~x12_known].drop(columns=non_predictors)

In [None]:
# Shapes of the imputation features, the imputation target, and the rows to fill.
for part in (train_miss, train_y, pred_miss):
    print("shape: ", part.shape)

In [None]:
# Impute X_12 with a classifier: fit on rows where X_12 is known, then predict
# it for the rows where it is missing (assumes X_12 takes discrete values -
# TODO confirm). random_state pins the estimator's internal randomness so the
# imputed values are identical on every Restart & Run All.
clf = GradientBoostingClassifier(random_state=42)
clf.fit(train_miss, train_y)
y_pred = clf.predict(pred_miss)

In [None]:
# Rebuild the rows whose X_12 was missing (all columns this time) and fill in
# the predictions. y_pred came from rows selected with the same mask, so the
# values line up positionally. assign() returns a new frame, which avoids the
# chained assignment (SettingWithCopyWarning) of writing to a filtered slice.
pred_miss = train[train.X_12.isnull()].assign(X_12=y_pred)

In [None]:
# Show every column when a wide frame is displayed.
pd.set_option('display.max_columns', None)
pred_miss

# Rows whose X_12 was present in the raw data (all columns kept).
train_miss = train[train.X_12.notna()]

# Recombine observed and imputed rows and restore the original row order.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train_file = pd.concat([train_miss, pred_miss]).sort_index()
# Columns that still contain at least one null after imputation.
null_col = [i for i in train_file.columns if train_file[i].isnull().sum() > 0]

In [None]:
# Preview the recombined (imputed) data set.
train_file.head(3)

## EDA (Exploratory data Analysis)
- In statistics, exploratory data analysis (EDA) is an approach to analyzing data sets to summarize their main characteristics, often with visual methods. A statistical model can be used or not, but primarily EDA is for seeing what the data can tell us beyond the formal modeling or hypothesis testing task.

In [None]:
import plotly.express as px
import plotly.graph_objects as go
# Count incidents per MULTIPLE_OFFENSE value and draw the counts as a bar chart.
multiple_offence_tab = (
    train_file
    .groupby('MULTIPLE_OFFENSE')
    .agg({'INCIDENT_ID': 'count'})
    .reset_index()
)
colours = ['#00CC96', '#636EFA']
class_bars = go.Bar(
    x=multiple_offence_tab.MULTIPLE_OFFENSE,
    y=multiple_offence_tab.INCIDENT_ID,
    marker_color=colours,
)
fig = go.Figure(data=[class_bars])
# update_layout returns the figure, so it is also rendered as the cell output.
fig.update_layout(title_text='Multple Offense are very high', autosize=False, width=550, height=450)

In [None]:
# Incident counts per (WEEKEND, MULTIPLE_OFFENSE) group, each labelled with
# its share of all incidents, drawn as grouped bars.
weekcount = (
    train_file
    .groupby(['WEEKEND', 'MULTIPLE_OFFENSE'])
    .agg({'INCIDENT_ID': 'count'})
    .reset_index()
)
weekcount['name'] = np.where(weekcount.WEEKEND == 0, 'Weekday', 'Weekend')
share_of_total = weekcount['INCIDENT_ID'] / weekcount['INCIDENT_ID'].sum() * 100
weekcount['Percentage'] = share_of_total.round(2).astype(str) + '%'
fig = px.bar(
    weekcount,
    x='MULTIPLE_OFFENSE',
    y='INCIDENT_ID',
    barmode='group',
    color='name',
    text='Percentage',
    width=550,
    height=450,
)
fig.show()

In [None]:
# Sum of MULTIPLE_OFFENSE per calendar year, drawn as a line chart.
df = (
    train_file
    .groupby(['YEAR'])
    .agg({'MULTIPLE_OFFENSE': 'sum'})
    .reset_index()
)
fig = px.line(df, x='YEAR', y='MULTIPLE_OFFENSE', width=550, height=450)
fig.show()

In [None]:
# Sum of MULTIPLE_OFFENSE per weekday, shown as a pie chart with day names.
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df = (
    train_file
    .groupby('DAYOFWEEK')
    .agg({'MULTIPLE_OFFENSE': 'sum'})
    .reset_index()
)
# DAYOFWEEK is 0-based with Monday first, matching the list positions above.
df['weekname'] = df.DAYOFWEEK.map(dict(enumerate(weekday_names)))
fig = px.pie(df, values='MULTIPLE_OFFENSE', names='weekname')
fig.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Distribution (histogram + KDE) of each anonymised feature X_1 .. X_15,
# one panel per feature on a 5x3 grid.
Numerical_features = [f'X_{i}' for i in range(1, 16)]
fig, ax = plt.subplots(5, 3, figsize=(20, 10))
for variable, subplot in zip(Numerical_features, ax.flatten()):
    # sns.distplot is deprecated and scheduled for removal; histplot with a
    # density-normalised histogram plus KDE is its documented replacement.
    sns.histplot(train_file[variable], kde=True, stat='density', kde_kws={'cut': 3}, ax=subplot)
# Keep axis labels of neighbouring panels from overlapping.
fig.tight_layout()

## SMOTE UPSAMPLING
#### The dataset is imbalanced, so I'm using the SMOTE upsampling technique to balance it
- SMOTE first selects a minority class instance a at random and finds its k nearest minority class neighbors. The synthetic instance is then created by choosing one of the k nearest neighbors b at random and connecting a and b to form a line segment in the feature space. The synthetic instances are generated as a convex combination of the two chosen instances a and b.

In [None]:
# Separate the MULTIPLE_OFFENSE label from the predictors; INCIDENT_ID is
# dropped from the predictors as well.
train_y = train_file['MULTIPLE_OFFENSE']
train_Ads = train_file.drop(columns=['INCIDENT_ID', 'MULTIPLE_OFFENSE'])

In [None]:
#!pip install imblearn

In [None]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
train_Ads, train_y = oversample.fit_resample(train_Ads, train_y)
len(train_Ads),len(train_y)

In [None]:
# Spliting data for training the model. Splitting the data will be done at the begining of feature seletion phase

X_train, X_test, y_train, y_test = train_test_split(train_Ads, train_y,test_size=0.20, random_state= 42)

In [None]:
# Preview two rows of the training features.
X_train.head(2)

## Feature Scaling
- Scaled each feature with a feature-scaling technique to reduce bias, bring the features onto a common scale, and speed up computation while training the model. After applying the Standard Scaler, each feature has zero mean and unit variance, so most values fall between -3 and 3


In [None]:
from sklearn.preprocessing import StandardScaler
# One scaler shared by both helpers, so test data is transformed with the
# statistics learned from the training data.
_feature_scaler = StandardScaler()


def scaling_train(df):
    """Fit the shared StandardScaler on ``df`` and return the scaled frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Training features.

    Returns
    -------
    pandas.DataFrame
        Standardised copy of ``df`` with the same column names and a fresh
        0..n-1 index. The result is also published as the global
        ``X_train_ADS``, which later cells read.
    """
    global X_train_ADS
    X_train_ADS = pd.DataFrame(_feature_scaler.fit_transform(df), columns=df.columns)
    return X_train_ADS


def scaling_test(df):
    """Scale ``df`` with the statistics fitted by :func:`scaling_train`.

    The scaler is NOT refitted here: fitting a second scaler on the test data
    would put train and test on different scales and leak test statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Test features with the same columns as the training features.

    Returns
    -------
    pandas.DataFrame
        Standardised copy of ``df`` with the same column names and a fresh
        0..n-1 index. The result is also published as the global
        ``X_test_ADS``, which later cells read.

    Raises
    ------
    sklearn.exceptions.NotFittedError
        If called before :func:`scaling_train`.
    """
    global X_test_ADS
    X_test_ADS = pd.DataFrame(_feature_scaler.transform(df), columns=df.columns)
    return X_test_ADS

In [None]:
# Both helpers publish their result as a global (X_train_ADS / X_test_ADS);
# only the second call's return value is shown as the cell output.
scaling_train(X_train)
scaling_test(X_test)

## Feature Selection

### Removing the Multicollinearity in the dataset

In [None]:
# Pairwise correlations between the scaled training features, drawn as an
# annotated heatmap on a 16x12 figure.
f, ax = plt.subplots(figsize=(16, 12))
heat = X_train_ADS.corr()
sns.heatmap(data=heat, annot=True, ax=ax)

## CHI Square method to select features
#### By using the "CHI Square test", we select the features with low p-values and high chi-square statistics (labelled "Fvalue" in the code below), i.e. the features on which the dependent feature depends most strongly

In [None]:
from sklearn.feature_selection import chi2

# Remove the columns judged collinear from the scaled training frame.
collinear_cols = ['X_3', 'X_7', 'X_12', 'DAYOFWEEK']
X_train_ADS = X_train_ADS.drop(columns=collinear_cols)
# Score the surviving columns with chi2, which returns (statistics, p-values).
# The test is run on the unscaled X_train - presumably because chi2 rejects
# the negative values that standardisation produces; TODO confirm.
f_p_values = chi2(X_train[X_train_ADS.columns], y_train)
f_p_values

In [None]:
# Select the 10 features with the smallest p-value and, among equal p-values,
# the largest chi-square statistic. Sorting on p-value alone picked arbitrarily
# between tied features (strong features often all report p = 0.0), which did
# not match the stated "low p-value, high statistic" rule.
# f_p_values is (statistics, p-values); the statistic keeps its historical
# "Fvalue" label.
chi2_stats = pd.Series(f_p_values[0], index=X_train_ADS.columns)
p_values = pd.Series(f_p_values[1], index=X_train_ADS.columns)
frame = {'Pvalue': p_values, 'Fvalue': chi2_stats}
Top10cols = (
    pd.DataFrame(frame)
    .sort_values(['Pvalue', 'Fvalue'], ascending=[True, False])
    .head(10)
    .index
)
Top10cols

In [None]:
# Restrict both scaled matrices to the selected columns and show their shapes.
X_train_ADS = X_train_ADS.loc[:, Top10cols]
X_test_ADS = X_test_ADS.loc[:, Top10cols]
X_train_ADS.shape, X_test_ADS.shape

## Creating a pipeline for Machine Learning models which require feature scaling

In [None]:
# Candidate classifiers keyed by the label printed in the evaluation loop.
# Each value is a dict with a single "model" entry holding an unfitted estimator.
# Estimators that use randomness get a fixed random_state so results are
# reproducible across runs.
model_names = {

    "Decision Tree Classfier":
    {
        "model": DecisionTreeClassifier(random_state=42)
    },
    "Random Forest Classfier":
    {
        "model": RandomForestClassifier(random_state=42)
    },
    "XgBoost Classfier":
    {
        # This entry used to hold sklearn's GradientBoostingClassifier, so the
        # results reported under the "XgBoost" label were not XGBoost results.
        "model": xg.XGBClassifier(random_state=42)
    },
    "Logestic Regresssion":
    {
        "model": LogisticRegression()
    },
    "KNN Classfier":
    {
        "model": KNeighborsClassifier()
    },
    "SVC":
    {
        "model" : SVC()
    }
}

In [None]:
# Fit every candidate on the selected training features, print its test-set
# report, confusion matrix and accuracy, and record the accuracy per label.
Model_Name = []
Model_Acc = []
for model_name, spec in model_names.items():
    model = spec['model']
    model.fit(X_train_ADS, y_train)
    y_pred = model.predict(X_test_ADS)
    acc = accuracy_score(y_test, y_pred)

    print(model_name)
    print(classification_report(y_test, y_pred))
    print("Confusin Matrix: \n",confusion_matrix(y_test, y_pred))
    print("Accuracy: \t",acc)
    print('\n \n')

    Model_Name.append(model_name)
    Model_Acc.append(acc)
    

In [None]:
# Pair every model label with the test accuracy recorded for it.
ModelName_Accuracy = {label: score for label, score in zip(Model_Name, Model_Acc)}
ModelName_Accuracy

- We are obtaining good accuracy with the KNN Classifier and Random Forest. If you give more importance to false positives, we can go with the Random Forest Classifier; otherwise the KNN Classifier is preferred
- If you like the notebook, please upvote and add your comments/suggestions to this notebook ..... Happy coding :)
