# Importing the required libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import shap
import xgboost

# Importing the required train and test dataset

In [None]:
# Read the train and test CSV files from ../input/novartis-data into frames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/novartis-data/Train.csv",
        "../input/novartis-data/Test.csv",
    )
)

In [None]:
# Preview the first rows of the training frame.
df_train.head()

## Looking for missing values

In [None]:
# Count the missing values in each column of the training frame.
df_train.isnull().sum()

### Feature X_12 has 182 missing values

In [None]:
# Count the missing values in each column of the test frame.
df_test.isnull().sum()

### Feature X_12 also has missing values in the test dataset

In [None]:
# Boxplot of every column between the first two and the last one.
feature_frame = df_train.iloc[:, 2:-1]
plt.figure(figsize=(20, 18))
feature_frame.boxplot()

#### Looking at the boxplots, it is evident that every feature contains a large number of outliers. The incident ID and date columns have been excluded.

In [None]:
# Pairwise correlation of the same column slice used for the boxplot
# (everything except the first two columns and the last one).
corr = df_train.iloc[:,2:-1].corr()
# Render the correlation table with a coolwarm colour gradient.
corr.style.background_gradient(cmap='coolwarm')

#### Looking at the plot it is evident that X_2 and X_3 are highly correlated, and X_12 and X_10 are highly correlated.

In [None]:
from sklearn.impute import KNNImputer

### Using KNN imputer to impute missing values

In [None]:
# Neighbour-count heuristic: the rounded square root of the number of
# training rows, moved up to the next odd number when it comes out even.
k = round(len(df_train) ** 0.5)
k |= 1  # setting the lowest bit leaves odd values alone and bumps even ones by 1
k

### Identifying square root of the number of observations and making it odd

In [None]:
# KNN imputer that uses the k computed above as its neighbour count.
imputer = KNNImputer(n_neighbors=k)

In [None]:
# Skip the first two columns of both frames; the last training column is the
# target and is kept apart from the feature matrix.
train_features = df_train.iloc[:, 2:-1]
train_target = df_train.iloc[:, -1]
test_features = df_test.iloc[:, 2:]
X, Y, x_test = train_features.values, train_target.values, test_features.values

#### Using selected features for classification.

In [None]:
# Learn the imputation from the training features only, then fill the gaps
# in both the training and the test matrix with that fitted imputer.
imputer.fit(X)
X, x_test = imputer.transform(X), imputer.transform(x_test)

### Imputing the dataset with KNN impute for test as well as train


In [None]:
from sklearn.preprocessing import StandardScaler

#### Importing Standard Scaler for pre processing of Data

In [None]:
# Fit the scaler on the training features, then apply the same scaling to
# both the training and the test matrix.
sc = StandardScaler().fit(X)
X, x_test = sc.transform(X), sc.transform(x_test)


#### Scaling all the features

In [None]:
# Correlation matrices for the column pairs (9, 11) and (1, 2) of X.
# Both are printed explicitly: as bare expressions only the last one in a
# notebook cell is displayed, so the first result would be silently dropped.
print(np.corrcoef(X[:,9],X[:,11]))
print(np.corrcoef(X[:,1],X[:,2]))

#### Since the correlation is high we will drop the features X_3 and X_12

In [None]:
# Remove the same two column positions from the training and test matrices.
dropped_columns = (2, 11)
X_reformed, x_test_reformed = (
    np.delete(matrix, dropped_columns, axis=1) for matrix in (X, x_test)
)

In [None]:
from sklearn.cluster import KMeans

#### Trying clustering to reduce the number of variables: if we can find clusters, we can then classify using them

In [None]:
# Accumulators for the elbow plot: inertia values and matching cluster counts.
iner, count = [], []

In [None]:
# Fit KMeans for 1 through 7 clusters and record the inertia of each fit.
for n_clusters in range(1, 8):
    kmeans = KMeans(n_clusters=n_clusters).fit(X_reformed)
    count.append(n_clusters)
    iner.append(kmeans.inertia_)
    

#### Looking for the ideal number of clusters. Since the ideal number of clusters cannot be determined, we drop this idea.

In [None]:
# Convert both accumulators to NumPy arrays before plotting.
count, iner = (np.array(values) for values in (count, iner))

In [None]:
# Elbow plot: inertia against the number of clusters.
plt.plot(count,iner)

In [None]:
from sklearn.decomposition import PCA
# Empty accumulators; the cells visible in this notebook do not use them.
explained_variance, count1 = [], []

#### Trying to reduce the number of features using a scree plot

In [None]:
# Fit a 10-component PCA on the reduced training features and keep the
# share of variance explained by each component for the scree plot.
pca = PCA(n_components=10).fit(X_reformed)
explained_variance1 = pca.explained_variance_ratio_


In [None]:
# Scree plot: explained-variance ratio of each principal component.
plt.plot(explained_variance1)

#### Since the number of components cannot be determined using a scree plot, we have to move to classification algorithms

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the library's default settings.
lr=LogisticRegression()

# Let's look at the balance of the training set

In [None]:
# Ratio of the target sum to the number of training rows.
# NOTE(review): this equals the share of the positive class only if the
# target is coded 0/1 — confirm.
p = Y.sum() / len(Y)
print(p)

#### Since it is highly imbalanced we have to resample the dataset

In [None]:
from imblearn.over_sampling import RandomOverSampler
# Randomly oversample the reduced training features and target (seeded with
# random_state=0 so the resampling is repeatable).
# NOTE(review): the whole training set is resampled here, before any
# train/validation split. Scores computed on X_resampled/Y_resampled may be
# optimistic because duplicated rows can sit on both sides of a split.
ros = RandomOverSampler(random_state=0)
X_resampled, Y_resampled = ros.fit_resample(X_reformed, Y)

#### Checking the balance again

In [None]:
# Same ratio as before, now computed on the oversampled target.
p = Y_resampled.sum() / len(Y_resampled)
print(p)

##### Now the dataset has been balanced

In [None]:
# Train the logistic regression on the oversampled data.
lr.fit(X_resampled,Y_resampled)

In [None]:
# Class predictions for the same rows the model was fitted on
# (these are training-set predictions, not held-out ones).
y=lr.predict(X_resampled)

In [None]:
from sklearn.metrics import accuracy_score, auc, confusion_matrix,f1_score, roc_curve, roc_auc_score

In [None]:
# Confusion matrix of the oversampled target against the predictions in y.
confusion_matrix(Y_resampled,y)

##### Creating the confusion matrix

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline

# 10-fold cross-validated recall for the logistic regression.
# The oversampler is placed inside an imblearn pipeline and the un-resampled
# X_reformed/Y are cross-validated, so resampling happens on the training part
# of each split only. Cross-validating data that was oversampled beforehand
# lets copies of one row land in both the training and validation parts,
# which inflates the score.
lr_cv_model = make_pipeline(RandomOverSampler(random_state=0), lr)
scores = cross_val_score(lr_cv_model, X_reformed, Y, cv=10, scoring='recall')

In [None]:
# Show the recall obtained on each of the cross-validation folds.
scores

##### The recall is 87% which can be improved.

In [None]:
# ROC curve for the logistic regression on the oversampled training data.
# roc_curve sweeps a decision threshold over the scores it is given, so it is
# fed predicted probabilities; hard 0/1 predictions would collapse the curve
# to a single operating point.
# NOTE(review): assumes a binary target whose positive class is
# lr.classes_[1] — confirm.
lr_scores = lr.predict_proba(X_resampled)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(Y_resampled, lr_scores)
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,estimator_name='logistic')
display.plot()
plt.show()

#### The ROC curve shows an AUC of about 88%.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

#### Using Randomized Search to find the optimum parameters for Random Forest

In [None]:
# Candidate hyper-parameter values sampled by the randomized search.
param = dict(
    bootstrap=[True],
    max_depth=list(range(3, 9)),
    max_features=list(range(5, 11)),
    min_samples_leaf=list(range(5, 11)),
    min_samples_split=[20, 25, 50],
    n_estimators=[500, 1000],
    criterion=["gini", "entropy"],
)

In [None]:
# Randomized search over the grid in `param`: 10 sampled settings, 3-fold CV,
# all available cores.
search_settings = dict(param_distributions=param, n_iter=10, cv=3, n_jobs=-1)
random = RandomizedSearchCV(estimator=RandomForestClassifier(), **search_settings)
random.fit(X_resampled, Y_resampled)

In [None]:
# `random` has already been fitted in the previous cell and fit() returns the
# estimator itself, so reuse the fitted object instead of repeating the whole
# randomized search a second time.
search=random
search.best_params_

#### It is clear that n_estimators should be greater than 500 trees

In [None]:
# Class predictions from the fitted search object on the oversampled training
# rows (training-set predictions, not held-out ones).
y=random.predict(X_resampled)

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline

# 10-fold cross-validated recall for the randomized-search random forest.
# The oversampler runs inside the pipeline on the training part of each outer
# split only, and the un-resampled X_reformed/Y are cross-validated.
# Scoring data that was oversampled beforehand lets copies of one row land in
# both the training and validation parts, which inflates the recall.
rf_cv_model = make_pipeline(RandomOverSampler(random_state=0), random)
scores = cross_val_score(rf_cv_model, X_reformed, Y, cv=10, scoring='recall',n_jobs=-1)
scores

###### Cross Validation Score is 97% which can be further improved

In [None]:
# ROC curve for the tuned random forest on the oversampled training data.
# Predicted probabilities are used as scores; hard 0/1 predictions would give
# roc_curve a single threshold and reduce the curve to one operating point.
# NOTE(review): assumes a binary target whose positive class is the second
# entry of the fitted classes_ — confirm.
rf_scores = random.predict_proba(X_resampled)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(Y_resampled, rf_scores)
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,estimator_name='randomforest')
display.plot()
plt.show()

In [None]:
from sklearn.ensemble import AdaBoostClassifier



#### Using Ada boost Classifier since we know that n_estimators should be greater than 500, therefore using 1000 trees

In [None]:
# AdaBoost classifier with 1000 boosting rounds.
ada=AdaBoostClassifier(n_estimators=1000)

In [None]:
# Train AdaBoost on the oversampled data.
ada.fit(X_resampled,Y_resampled)

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline

# 10-fold cross-validated recall for AdaBoost.
# Oversampling happens inside the pipeline, on the training part of each split
# only, and the un-resampled X_reformed/Y are cross-validated. Scoring data
# that was oversampled beforehand lets copies of one row land in both the
# training and validation parts, which inflates the recall.
ada_cv_model = make_pipeline(RandomOverSampler(random_state=0), ada)
scores = cross_val_score(ada_cv_model, X_reformed, Y, cv=10, scoring='recall',n_jobs=-1)
scores

##### Best recall till now using Adaboost Classifier

In [None]:
# ROC curve for AdaBoost on the oversampled training data.
# y keeps the hard class predictions as before; the curve itself is built from
# predicted probabilities, because hard 0/1 labels would give roc_curve a
# single threshold and reduce the curve to one operating point.
# NOTE(review): assumes a binary target whose positive class is
# ada.classes_[1] — confirm.
y=ada.predict(X_resampled)
ada_scores = ada.predict_proba(X_resampled)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(Y_resampled, ada_scores)
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,estimator_name='Adaboost')
display.plot()
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

#### Using the Gradient Boosting algorithm to check whether it performs better than AdaBoost.

In [None]:
# Gradient boosting classifier with 1000 boosting stages, trained on the
# oversampled data.
gbc=GradientBoostingClassifier(n_estimators=1000)
gbc.fit(X_resampled,Y_resampled)

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline

# 10-fold cross-validated recall for gradient boosting.
# Oversampling happens inside the pipeline, on the training part of each split
# only, and the un-resampled X_reformed/Y are cross-validated. Scoring data
# that was oversampled beforehand lets copies of one row land in both the
# training and validation parts, which inflates the recall.
gbc_cv_model = make_pipeline(RandomOverSampler(random_state=0), gbc)
scores = cross_val_score(gbc_cv_model, X_reformed, Y, cv=10, scoring='recall',n_jobs=-1)
scores

#### Since the average score of Adaboost Classifier is greater than Gradient Boost Classifier we will go for Adaboost Classifier.

In [None]:
# Predict the test rows with AdaBoost and write them, next to the first
# column of the test frame, to Sample.csv without the index.
y = ada.predict(x_test_reformed)
xcv = dict(INCIDENT_ID=df_test.iloc[:, 0], MULTIPLE_OFFENSE=y)
sample = pd.DataFrame(xcv)
sample.to_csv("Sample.csv", index=False)

### Using adaboost to predict test case