In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV





# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/aryaai-dataset/Assignment - Data Scientist (1).docx
/kaggle/input/aryaai-dataset/Arya_DataScientist_Assignment/test_set.csv
/kaggle/input/aryaai-dataset/Arya_DataScientist_Assignment/training_set.csv


**Notebook should contain :**
1. EDA, Feature selection, preprocessing
2. Model performance analysis in terms of validation and risks involved
3. model predictions for test dataset
4. write the dependencies, libraries in seperate python file
5. Readme file - approach to solve prob, thought process

**Task steps:**
1. Split the train set into train and validation in 4:1 ratio
2. Explain model selection. Apply classification model 
3. Evaluate model accuracy



## Approach:

Notebook 1: 

1. Load and split the data 

2. Standardize and Pipeline tree-based algorithms to handle data imbalances

3. Obtain algorithms with highest accuracy

4. Tune the algorithm and find best parameters

Notebook 2:

1. Explore the dataset 
    - Check data relations
    - Data correlations
    - Missing values
    - Outliers
    - Different data types
    - Fix data distribution skewness, kurtosis
    - Fix outliers
    - Scale, undersample the data
    
2. Split the cleaned data into 3 sets ( Test, Train, validation set)

3. Utilise 5 fold validation and compare accuracy / recall / roc-auc scores for train, test and validation sets 

4. Take the parameters and model from notebook1. Integrate data with this model.

5. Plot confusion matrix, roc-auc curves and expected-actual prediction


## Load the data

In [2]:
train_df = pd.read_csv('../input/aryaai-dataset/Arya_DataScientist_Assignment/training_set.csv')
test_df = pd.read_csv('../input/aryaai-dataset/Arya_DataScientist_Assignment/test_set.csv')

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3910 entries, 0 to 3909
Data columns (total 59 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  3910 non-null   int64  
 1   X1          3910 non-null   float64
 2   X2          3910 non-null   float64
 3   X3          3910 non-null   float64
 4   X4          3910 non-null   float64
 5   X5          3910 non-null   float64
 6   X6          3910 non-null   float64
 7   X7          3910 non-null   float64
 8   X8          3910 non-null   float64
 9   X9          3910 non-null   float64
 10  X10         3910 non-null   float64
 11  X11         3910 non-null   float64
 12  X12         3910 non-null   float64
 13  X13         3910 non-null   float64
 14  X14         3910 non-null   float64
 15  X15         3910 non-null   float64
 16  X16         3910 non-null   float64
 17  X17         3910 non-null   float64
 18  X18         3910 non-null   float64
 19  X19         3910 non-null  

## EDA

In [4]:
x = train_df.drop('Y',axis=1)
y = train_df['Y']

In [5]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y,train_size=0.33)


In [6]:
print(xtrain.shape,'\n',xtest.shape,'\n',ytrain.shape,'\n',ytest.shape)

(1290, 58) 
 (2620, 58) 
 (1290,) 
 (2620,)


Let's try out the logistic regression algorithm with a pipeline to see how well the numerical data fits to a curve.

In [7]:


pipeline = Pipeline([('normalizer', StandardScaler()), #Step1 - normalize data
    ('clf', LogisticRegression())])
scores = cross_validate(pipeline, xtrain, ytrain)
# print(scores)
print(scores['test_score'].mean())

0.9069767441860466


The Avg test accuracy of logistic regression is 90.31%.

Curious to checkout the score of other classification algorithms.

## Pipelining classification algorithms 

We want to find this algorithm and improve its accuracy by cross-validating. 

> - Dtree
> - GBDT
> - SVM
> - LogReg
> - KNN

In [8]:

cl = []
cl.append(DecisionTreeClassifier())
cl.append(RandomForestClassifier())
cl.append(GradientBoostingClassifier())
cl.append(LogisticRegression())
cl.append(SVC())

pipeline = Pipeline([('normalizer', StandardScaler()), #Step1 - normalize data
    ('clf', LogisticRegression())])
pipeline.steps
for c in cl:
    pipeline.set_params(clf = c)
    scores = cross_validate(pipeline, xtrain,ytrain)
    print('_'*25)
    print(str(c))
    print('_'*25)
    print(scores['test_score'].mean())
#     for k, v in scores.items():
#         print(k,'mean',v.mean())
#         print(k,'std',v.std())
#     print('_'*25)
    
    



_________________________
DecisionTreeClassifier()
_________________________
0.8852713178294573
_________________________
RandomForestClassifier()
_________________________
0.9364341085271318
_________________________
GradientBoostingClassifier()
_________________________
0.9395348837209303
_________________________
LogisticRegression()
_________________________
0.9069767441860466
_________________________
SVC()
_________________________
0.9178294573643411


The GradientBoostingClassifier() has highest accuracy of 94.57% 

We cross validate and perform parameter tuning to find the best parameters for GBC 

## Parameter-tuning

In [9]:
pipeline.set_params(clf= GradientBoostingClassifier())
pipeline.steps

[('normalizer', StandardScaler()), ('clf', GradientBoostingClassifier())]

In [10]:
parameters = {
    "loss": ["deviance"],
    "learning_rate": [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1], 
    "n_estimators": [200, 350, 500, 750],
    "max_depth": [3, 6, 8]
#     "loss":["deviance"],
#     "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
#     "min_samples_split": np.linspace(0.1, 0.5, 12),
#     "min_samples_leaf": np.linspace(0.1, 0.5, 12),
#     "max_depth":[3,5,8],
#     "max_features":["log2","sqrt"],
#     "criterion": ["mae"],
#     "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
#     "n_estimators":[10]
    }
cvgrid = GridSearchCV(GradientBoostingClassifier(),
                      param_grid = parameters,cv=4,scoring='accuracy')
cvgrid.fit(xtrain,ytrain)
print('Train Score',cvgrid.score(xtrain,ytrain))
print('Test Score',cvgrid.score(xtest,ytest))
print('_'*25)
print('Best parameters',cvgrid.best_params_)


Train Score 1.0
Test Score 0.9408396946564885
_________________________
Best parameters {'learning_rate': 0.5, 'loss': 'deviance', 'max_depth': 3, 'n_estimators': 350}


In [11]:
ypred = cvgrid.predict(xtest)
acc = accuracy_score(ytest,ypred)
print('Accuracy after cv is ',acc)

Accuracy after cv is  0.9408396946564885


In [12]:
result = pd.DataFrame({'GBC pred':ypred})
# test_df['Y'] = result
# result = test_df[['Unnamed: 0','Y']]
result.to_csv('result_gbc.csv')


In [13]:
hyperparameters = {
    'n_estimators'      : list(range(10, 50, 10)),
    'max_features'      : ['auto', 'sqrt', 'log2'],
    'criterion'         : ['gini', 'entropy'],
    'max_depth'         : [None, 1, 2, 3, 4, 5],
    'min_samples_split' : list(range(2,5)),
    'min_samples_leaf'  : list(range(1,5))
}

cvgrid2 = GridSearchCV(RandomForestClassifier(random_state=1),
                      param_grid = hyperparameters,cv=4,scoring='accuracy')
cvgrid2.fit(xtrain,ytrain)
print('Train Score',cvgrid2.score(xtrain,ytrain))
print('Test Score',cvgrid2.score(xtest,ytest))
print('_'*25)
print('Best parameters',cvgrid2.best_params_)
pred2 = cvgrid2.predict(xtest)
acc = accuracy_score(ytest,pred2)
print('Accuracy after cv is ',acc)


Train Score 0.9961240310077519
Test Score 0.9396946564885497
_________________________
Best parameters {'criterion': 'entropy', 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 30}
Accuracy after cv is  0.9396946564885497


In [14]:
output = pd.DataFrame({'RF Predictions':pred2})
output.to_csv('RF.csv',index=False)