In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing libraries.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from scipy import stats
from sklearn.metrics import accuracy_score, roc_curve, auc, mean_squared_error, f1_score
import warnings
warnings.filterwarnings('ignore')

# Reading the data.

In [None]:
# IPython shell escape: list the files in the competition dataset directory.
!ls ../input/data-science-london-scikit-learn

In [None]:
# Load the three competition CSVs. header=None makes pandas treat the first
# line of each file as data and label the columns with integers (0, 1, ...).
train = pd.read_csv("../input/data-science-london-scikit-learn/train.csv", header = None)
test = pd.read_csv("../input/data-science-london-scikit-learn/test.csv", header = None)
train_label = pd.read_csv("../input/data-science-london-scikit-learn/trainLabels.csv", header = None)

# Preprocessing

In [None]:
# Preview the first rows of the label frame.
train_label.head()

In [None]:
# Preview the first rows of the training features.
train.head()

In [None]:
# Preview the first rows of the competition test features.
test.head()

In [None]:
# Report the (rows, columns) shape of each loaded frame.
print(f"Shape of train dataframe is {train.shape}")
print(f"Shape of test dataframe is {test.shape}")
print(f"Shape of train label dataframe is {train_label.shape}")

In [None]:
# For each frame, report whether any column contains at least one null value.
print(f"Null value in train dataframe is {train.isnull().sum().any()}")
print(f"Null value in test dataframe is {test.isnull().sum().any()}")
print(f"Null value in train_label dataframe is {train_label.isnull().sum().any()}")

In [None]:
# Attach the labels (column 0 of train_label) to the feature frame as a new
# 'target' column, then preview the result.
train['target'] = train_label[0]
train.head()

In [None]:
# Finding duplicate rows.

# Boolean mask that is True for every row repeating an earlier row.
is_repeat = train.duplicated()
dupli = train[is_repeat]
dupli

In [None]:
# Removing all the columns having only 1 unique value.
# The constant columns are collected first so that neither frame is mutated
# while its columns are being scanned, and the drop on `test` ignores labels
# it does not contain (e.g. 'target', which only exists in `train`).

constant_columns = [column for column in train.columns if train[column].nunique() == 1]
for column in constant_columns:
    print(column)

train.drop(columns = constant_columns, inplace = True)
test.drop(columns = constant_columns, inplace = True, errors = 'ignore')

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
test.info()

In [None]:
# Summary statistics for the columns of the training frame.
train.describe()

In [None]:
# Before removing Outliers: one box plot per column of `train`, laid out on a
# 5 x 10 subplot grid.

plt.figure(figsize = (20, 20))
for position, column_name in enumerate(train.columns, start = 1):
    plt.subplot(5, 10, position)
    sns.boxplot(train.iloc[:, position - 1])
    plt.xlabel(column_name, size = 10)

In [None]:
def Outliers(data, ft):
    """Return the index labels of rows whose ``ft`` value is an IQR outlier.

    A value counts as an outlier when it lies strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25% and 75% quantiles of ``data[ft]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to inspect.
    ft : hashable
        Label of the column to inspect.

    Returns
    -------
    pandas.Index
        Labels (taken from ``data.index``) of the outlying rows.
    """
    values = data[ft]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1

    too_low = values < q1 - 1.5 * spread
    too_high = values > q3 + 1.5 * spread
    return data.index[too_low | too_high]

In [None]:
# Union of the outlier row labels found in any column of `train`
# (every column is scanned, including 'target').
flagged_rows = []
for column in train.columns:
    flagged_rows += list(Outliers(train, column))
index = list(set(flagged_rows))
len(index)

In [None]:
# print("Size of training data before removing outliers is {}".format(train.shape))
# train.drop(index, inplace = True, axis = 0)
# print("Size of training data after removing outliers is {}".format(train.shape))

In [None]:
# # After removing Outliers.

# plt.figure(figsize = (20,20))
# for i in range (len(train.columns)):
#     plt.subplot(5, 10, i+1)
#     sns.boxplot(train.iloc[:,i])
#     plt.xlabel(train.columns[i], size = 10)

In [None]:
# Count plot for target column.

target_counts = train['target'].value_counts()
print(target_counts)

plt.figure(figsize = (8, 5))
ax = sns.countplot(x = train['target'])
ax.set_xlabel('Target', size = 12)
ax.set_ylabel('Count', size = 12)
ax.set_title('Distribution in target column before resmpling', size = 12)

In [None]:
# Applying Standard Scaler to every feature column (all columns but 'target').
# NOTE(review): the scaler is fitted on all training rows before the hold-out
# split made further down, so the hold-out rows influence the scaling
# statistics.

features = train.drop(columns = 'target')
col = features.columns
std = StandardScaler()
x_std = std.fit_transform(features)
x = pd.DataFrame(x_std, columns = col)

x.head()

In [None]:
# Target vector: the 'target' column of `train`.
y = train['target']
y.head()

In [None]:
# Hold out 25% of the rows for evaluation; random_state = 42 fixes the shuffle.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.25, random_state = 42)

# Building models.

In [None]:
def Models(model, name, d, xtrain, ytrain, xtest, ytest):
    """Fit ``model`` and append its train/test scores to the results dict ``d``.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    name : str
        Label printed while working and stored under ``d['Name']``.
    d : dict of lists
        Must already hold the keys 'Name', 'Training ACU', 'Testing ACU',
        'F1_Score' and 'MSE'; one value is appended to each list.
    xtrain, ytrain
        Features / labels the model is fitted on.
    xtest, ytest
        Held-out features / labels the model is scored on.

    Returns
    -------
    dict
        The same ``d`` object, mutated in place.
    """
    print("Working on {} model".format(name))

    cla = model
    cla.fit(xtrain, ytrain)

    # Metrics are called as (y_true, y_pred), the order scikit-learn documents.
    # Accuracy is stored in percent under the '... ACU' keys.
    train_pred = cla.predict(xtrain)
    train_accuracy = accuracy_score(ytrain, train_pred)*100

    test_pred = cla.predict(xtest)
    test_accuracy = accuracy_score(ytest, test_pred)*100

    F1 = f1_score(ytest, test_pred)
    MSE = mean_squared_error(ytest, test_pred)

    d['Name'].append(name)
    d['Training ACU'].append(train_accuracy)
    d['Testing ACU'].append(test_accuracy)
    d['F1_Score'].append(F1)
    d['MSE'].append(MSE)

    print("**********"*5)
    print()
    return d

In [None]:
# Score every candidate classifier with Models() and collect the results
# column-wise in `d`.
d = {key: [] for key in ('Name', 'Training ACU', 'Testing ACU', 'F1_Score', 'MSE')}

models = [
    [RandomForestClassifier(n_estimators = 200), 'Random Forest'],
    [DecisionTreeClassifier(), 'Decision Tree'],
    [XGBClassifier(tree_method='gpu_hist'), 'XGBoost'],
    [CatBoostClassifier(task_type="GPU"), 'CatBoost'],
    [GaussianNB(), 'Naive Bayes'],
    [LogisticRegression(), 'Logistic Regression'],
    [LGBMClassifier(device_type = 'GPU'), 'Light GBM'],
]
for estimator, label in models:
    d = Models(estimator, label, d, xtrain, ytrain, xtest, ytest)

# Predicting on test data.

In [None]:
# Turn the collected scores into a table: one row per model, one column per key of `d`.
acu_data = pd.DataFrame(data = d)
acu_data

In [None]:
# Fit a LightGBM classifier with default hyper-parameters on the GPU device,
# passing the hold-out split as the evaluation set.
# NOTE(review): recent LightGBM releases no longer accept `verbose` as a
# fit() argument — confirm against the installed version.
cla = LGBMClassifier(device_type = 'GPU')
cla.fit(xtrain, ytrain, eval_set = (xtest, ytest), verbose = 10)

In [None]:
# Candidate values for the random forest hyper-parameter search.
params = {
    "n_estimators": list(range(10, 800, 10)),
    "max_depth": list(range(2, 40, 2)),
    "min_samples_leaf": list(range(15, 40)),
    "min_samples_split": list(range(5, 240)),
}
rcla = RandomForestClassifier()
rcla

In [None]:
# Randomized search over `params` for the random forest: 5-fold CV, scored by
# accuracy, using all available cores. No n_iter or random_state is given, so
# the library defaults apply and the sampled candidates differ between runs.
grid = RandomizedSearchCV(estimator = rcla, param_distributions = params, cv = 5, scoring = 'accuracy', n_jobs = -1)
grid.fit(xtrain, ytrain)

In [None]:
# Keep the best random forest parameters; `grid` is reused for the XGBoost search below.
best_param_rf = grid.best_params_
best_param_rf

In [None]:
# Mean cross-validated accuracy of the best random forest candidate.
grid.best_score_

In [None]:
# Rebuild the random forest with the best parameters found and fit it on the training split.
rcla = RandomForestClassifier(**best_param_rf)
rcla.fit(xtrain, ytrain)

In [None]:
# Accuracy (in percent) of the tuned random forest on the rows it was fitted on.
predicted = rcla.predict(xtrain)
train_score = 100 * accuracy_score(predicted, ytrain)
print(f"Accuracy using Random Forest on training data is {train_score} %")

In [None]:
# Hold-out performance of the tuned random forest.
# Accuracy is reported in percent; F1 is a 0-1 fraction, so it is printed
# without a percent sign.
predicted = rcla.predict(xtest)
test_score = accuracy_score(ytest, predicted)*100
f1 = f1_score(ytest, predicted)
print("Accuracy using Random Forest on testing data is {} %".format(test_score))
print("F1 Score using Random Forest testing data is {}".format(f1))

In [None]:
# Candidate values for the XGBoost hyper-parameter search.
# - eval_metric only lists metrics XGBoost defines for binary targets
#   ('accuracy' is not an XGBoost metric name and 'mlogloss' is the
#   multiclass log-loss).
# - verbosity / validate_parameters are logging and validation switches, not
#   model hyper-parameters, so they are not searched over.
param_grid = {'n_estimators': list(range(90, 210, 10)),
              'learning_rate': [0.2, 0.15, 0.1, 0.05],
              'eval_metric': ['logloss', 'error'],
              'booster' : ['gbtree', 'gblinear'],
             }
xcla = XGBClassifier(tree_method='gpu_hist')
xcla

In [None]:
# Randomized search over `param_grid` for XGBoost: 5-fold CV, scored by
# accuracy. This rebinds `grid`, replacing the random forest search object.
grid = RandomizedSearchCV(estimator = xcla, param_distributions = param_grid, cv = 5, scoring = 'accuracy', n_jobs = -1)
grid.fit(xtrain, ytrain)

In [None]:
# Mean cross-validated accuracy of the best XGBoost candidate.
print("Randomized Search best Score is {}".format(grid.best_score_))

In [None]:
# Keep the best XGBoost parameters found by the search.
best_parameters_xg = grid.best_params_
best_parameters_xg

In [None]:
# Rebuild XGBoost from the best searched parameters and fit it on the training split.
# Only the searched parameters are passed: the tree_method='gpu_hist' setting
# of the estimator used during the search is not repeated here.
xcla = XGBClassifier(**best_parameters_xg)
xcla.fit(xtrain, ytrain)

In [None]:
# Training accuracy (in percent) of the tuned XGBoost model `xcla`.
# This cell follows the XGBoost fit, so it must score `xcla`, not the random
# forest `rcla`.
predicted = xcla.predict(xtrain)
train_score = accuracy_score(ytrain, predicted)*100
print("Accuracy using XGBoost on training data is {} %".format(train_score))

In [None]:
# Hold-out performance of the tuned XGBoost model `xcla` (not the random
# forest `rcla`). Accuracy is reported in percent; F1 is a 0-1 fraction, so
# it is printed without a percent sign.
predicted = xcla.predict(xtest)
test_score = accuracy_score(ytest, predicted)*100
f1 = f1_score(ytest, predicted)
print("Accuracy using XGBoost on testing data is {} %".format(test_score))
print("F1 Score using XGBoost testing data is {}".format(f1))

In [None]:
# Preview the competition test features before scaling them.
test.head()

In [None]:
# Scale the competition test set with `std`, the scaler already fitted on the
# training features, so test rows go through the same transform the models
# were trained on. Fitting a fresh StandardScaler on `test` would centre and
# scale it with different statistics.
col = test.columns
x_std = std.transform(test)
x_test = pd.DataFrame(data = x_std, columns = col)


# NOTE(review): predictions come from `cla` (the LightGBM model fitted on
# xtrain), not from the tuned `rcla` / `xcla` — confirm this is intended.
predicted = cla.predict(x_test)
predicted

In [None]:
# 1-based row ids, one per prediction.
l = list(range(1, len(predicted) + 1))
len(l)

In [None]:
# Assemble the submission frame: 'Id' from the 1-based id list `l`,
# 'Solution' from the predictions.
submit = pd.DataFrame(data = {'Id': l, 'Solution': predicted})
submit

In [None]:
# Write the submission to Submission1.csv in the current directory, without
# the index column, and preview what was written.
submit.to_csv('Submission1.csv', index = False)
submit.head()