# Classification modeling using Class Variable and dropping price_var variable


In [None]:
import pandas as pd
import os
import pandas_profiling
from pandas_profiling import ProfileReport
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.metrics import silhouette_score, silhouette_samples
import sklearn.metrics
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, log_loss, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

import scipy

from IPython.core.interactiveshell import InteractiveShell
# Set the shell's node-interactivity mode to "all".
# NOTE(review): presumably this makes every bare expression in a cell echo its
# value (not only the last one) — confirm against the IPython documentation.
InteractiveShell.ast_node_interactivity = "all"



In [None]:
# Load the five yearly financial-indicator CSV files, one DataFrame per file,
# in chronological order (2014 -> df1 ... 2018 -> df5).
_yearly_csv_paths = (
    "../input/200-financial-indicators-of-us-stocks-20142018/2014_Financial_Data.csv",
    "../input/200-financial-indicators-of-us-stocks-20142018/2015_Financial_Data.csv",
    "../input/200-financial-indicators-of-us-stocks-20142018/2016_Financial_Data.csv",
    "../input/200-financial-indicators-of-us-stocks-20142018/2017_Financial_Data.csv",
    "../input/200-financial-indicators-of-us-stocks-20142018/2018_Financial_Data.csv",
)
df1, df2, df3, df4, df5 = [pd.read_csv(csv_path) for csv_path in _yearly_csv_paths]

In [None]:
# Tag every yearly frame with the year it belongs to (df1 -> 2014 ... df5 -> 2018).
for _frame, _year in zip((df1, df2, df3, df4, df5), range(2014, 2019)):
    _frame['Year'] = _year

In [None]:
# Show the column labels of the 2014 frame.
df1.columns

In [None]:
# Print the DataFrame.info() summary of the 2014 frame.
df1.info()

In [None]:
### Drop the Stock Column
# The stock identifier is taken to be the first column of each yearly frame;
# it is removed by position-derived label so the remaining columns are kept.
df1, df2, df3, df4, df5 = [
    frame.drop(columns=frame.columns[0])
    for frame in (df1, df2, df3, df4, df5)
]

In [None]:
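### Convert Sector to Numeric (disabled: the one-hot encoding below is commented out and the Sector column is dropped later instead)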
#df1 = pd.get_dummies(df1,columns=['Sector'],dtype= 'int64')
#df2 = pd.get_dummies(df2,columns=['Sector'],dtype= 'int64')
#df3 = pd.get_dummies(df3,columns=['Sector'],dtype= 'int64')
#df4 = pd.get_dummies(df4,columns=['Sector'],dtype= 'int64')
#df5 = pd.get_dummies(df5,columns=['Sector'],dtype= 'int64')

In [None]:
### Change Data Type
# Cast the Class column of every yearly frame to the generic object dtype.
for _frame in (df1, df2, df3, df4, df5):
    _frame['Class'] = _frame['Class'].astype(object)

In [None]:
# Each yearly frame names its price-variation column after the following
# year; give all of them the common label PRICE_VAR so the frames line up.
_price_var_sources = (
    (df1, "2015 PRICE VAR [%]"),
    (df2, "2016 PRICE VAR [%]"),
    (df3, "2017 PRICE VAR [%]"),
    (df4, "2018 PRICE VAR [%]"),
    (df5, "2019 PRICE VAR [%]"),
)
for _frame, _source_label in _price_var_sources:
    _frame.rename(columns={_source_label: "PRICE_VAR"}, inplace=True)


In [None]:
# Remove the Sector column from every yearly frame, in place.
for _frame in (df1, df2, df3, df4, df5):
    _frame.drop(columns=['Sector'], inplace=True)

Impute missing values using KNN Imputer

In [None]:
from sklearn.impute import KNNImputer

# One shared KNN imputer: 20 neighbours, distance-weighted, using the
# NaN-aware Euclidean metric; input is copied rather than modified.
imputer = KNNImputer(
    n_neighbors=20,
    weights='distance',
    metric='nan_euclidean',
    copy=True,
)

In [None]:
# Fit the imputer on the 2014 frame, fill its gaps, and wrap the resulting
# array back into a DataFrame carrying the original column labels.
df1_clean = pd.DataFrame(imputer.fit_transform(df1), columns=list(df1))

In [None]:
# Fit the imputer on the 2015 frame, fill its gaps, and wrap the resulting
# array back into a DataFrame carrying the original column labels.
df2_clean = pd.DataFrame(imputer.fit_transform(df2), columns=list(df2))


In [None]:
# Fit the imputer on the 2016 frame, fill its gaps, and wrap the resulting
# array back into a DataFrame carrying the original column labels.
df3_clean = pd.DataFrame(imputer.fit_transform(df3), columns=list(df3))

In [None]:
# Fit the imputer on the 2017 frame, fill its gaps, and wrap the resulting
# array back into a DataFrame carrying the original column labels.
df4_clean = pd.DataFrame(imputer.fit_transform(df4), columns=list(df4))

In [None]:
# Fit the imputer on the 2018 frame, fill its gaps, and wrap the resulting
# array back into a DataFrame carrying the original column labels.
df5_clean = pd.DataFrame(imputer.fit_transform(df5), columns=list(df5))

In [None]:

##### Check Missing Value Again
# Per-column count of missing entries for each imputed yearly frame.
df1_clean.isna().sum()
df2_clean.isna().sum()
df3_clean.isna().sum()
df4_clean.isna().sum()
df5_clean.isna().sum()

In [None]:
###### Concatenate
# Stack the five imputed yearly frames row-wise into a single table.
# ignore_index is not requested, so each frame keeps its own row labels.
data = pd.concat(
    objs=[df1_clean, df2_clean, df3_clean, df4_clean, df5_clean],
    axis=0,
)

In [None]:
# Print the DataFrame.info() summary of the combined table.
data.info()

In [None]:
# Preview the first rows of the combined table.
data.head()

In [None]:
# Data missing information
# Build a three-row summary (one column per feature of `data`):
#   'column type'      - dtype of the column
#   'null values (nb)' - number of missing entries
#   'null values (%)'  - share of missing entries, in percent
# The rows are assembled with pd.concat because DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
_null_counts = data.isnull().sum()
data_info = pd.concat([
    pd.DataFrame(data.dtypes).T.rename(index={0: 'column type'}),
    pd.DataFrame(_null_counts).T.rename(index={0: 'null values (nb)'}),
    pd.DataFrame(_null_counts / data.shape[0] * 100).T.rename(index={0: 'null values (%)'}),
])
display(data_info)

In [None]:
# Show the dtype of every column in the combined table.
data.dtypes

In [None]:
# Train Year: 2014 - 2016
# Test Year:  2017 - 2018
all_year = set(data['Year'].unique())
# range() excludes its stop value, so the stop has to be 2019 for the test
# window to contain both 2017 and 2018 (a stop of 2018 silently left the
# 2018 rows in the training set).
test_year = set(range(2017, 2019))
train_year = all_year - test_year

# Sanity check: number of years in each partition.
len(train_year), len(test_year), len(all_year)

# Take explicit copies so that later in-place edits of train/test operate on
# independent frames rather than on filtered selections of `data`.
train = data[data['Year'].isin(train_year)].copy()
test = data[data['Year'].isin(test_year)].copy()

# Class balance of the training partition.
train['Class'].value_counts()

# Fraction of training rows whose Class equals 1.
class_ratio = len(train[train['Class'] == 1]) / len(train.index)
class_ratio

# Share of all rows that fall into the test / train partition.
len(test) / len(data)
len(train) / len(data)

In [None]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(data1, data1['Class'], test_size=0.3, random_state=42)
# Remove the split key (Year) and PRICE_VAR from all three frames so that
# only the indicator features and the Class target remain.
# The result is re-bound instead of dropped in place: train/test were
# obtained by boolean filtering of `data`, and in-place mutation of such
# selections makes pandas emit SettingWithCopyWarning.
_columns_to_remove = ['Year', 'PRICE_VAR']
data = data.drop(columns=_columns_to_remove)
train = train.drop(columns=_columns_to_remove)
test = test.drop(columns=_columns_to_remove)

In [None]:
# Work on a separate copy of the test frame so `test` itself keeps its Class column.
test2=test.copy()

In [None]:
# Strip the target from the copy, leaving a features-only test frame.
test2.drop(columns=['Class'], inplace=True)

In [None]:
# Show the column labels of the test frame (still includes Class).
test.columns

In [None]:
# Show the column labels of the features-only test frame.
test2.columns

In [None]:
# Show the column labels of the combined table.
data.columns

In [None]:
# Store the Class target as 64-bit integers in the full, train and test frames.
for _frame in (data, train, test):
    _frame['Class'] = _frame['Class'].astype('int64')

In [None]:
# Install PyCaret from inside the notebook (IPython shell escape, not Python syntax).
# NOTE(review): no version is pinned here, and the `silent` argument used below
# is believed to have been removed in PyCaret 3.x — confirm the installed version.
!pip install pycaret
from pycaret.classification import *
# Configure the PyCaret classification experiment on the training frame with
# 'Class' as target. The flags request outlier removal, normalization with the
# 'robust' method, multicollinearity removal and feature selection;
# session_id fixes the experiment seed value passed to PyCaret.
classification_setup = setup(data= train, target='Class',remove_outliers=True,normalize=True, remove_multicollinearity=True ,normalize_method='robust',
                            feature_selection=True ,silent = True,session_id = 6563)

In [None]:
# Run PyCaret's model comparison with the results sorted by AUC.
compare_models(sort = 'AUC')

In [None]:
# Create the model registered under the id 'catboost' via PyCaret.
cb_model = create_model('catboost')

In [None]:
# Call PyCaret's interpret_model on the CatBoost model.
# NOTE(review): presumably renders a feature-importance style plot — confirm.
interpret_model(cb_model)

In [None]:
# Score the features-only test frame (test2) with the CatBoost model.
pred_cb = predict_model(cb_model, data=test2)

In [None]:
# Ask PyCaret's automl for a model, with use_holdout enabled.
# NOTE(review): presumably returns the best model of this session judged on
# the hold-out set — confirm against the PyCaret documentation.
best=automl(use_holdout=True)

In [None]:
# Display the model returned by automl.
best

In [None]:
# Display the CatBoost model object.
cb_model

In [None]:
# Create the model registered under the id 'lightgbm' via PyCaret.
lgbm_model = create_model('lightgbm')

In [None]:
# Call PyCaret's evaluate_model on the LightGBM model.
evaluate_model(lgbm_model)

# Neural networks, since they are not included in PyCaret

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Feature matrix for the neural network: the training frame minus its target.
X_train = train.drop(columns=['Class'])

In [None]:
# Target vector for the neural network: the Class column of the training frame.
Y_train=train['Class']

In [None]:
# Multi-layer perceptron with hidden_layer_sizes=150, logistic activation,
# trained with the L-BFGS solver; alpha is the regularization setting and
# random_state fixes the seed.
clf = MLPClassifier(
    random_state=1,
    hidden_layer_sizes=150,
    activation='logistic',
    solver='lbfgs',
    alpha=0.2,
    learning_rate='adaptive',
    max_iter=300,
)
# fit() returns the estimator itself, so fitting as a separate statement
# leaves `clf` bound to the same trained object.
clf.fit(X_train, Y_train)

In [None]:
# Predict class labels for the features-only test frame.
pred=clf.predict(test2)

In [None]:
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true classes and columns the predicted ones. Passing the predictions first
# transposed the matrix (swapping false positives and false negatives).
confusion_matrix(test['Class'], pred)

In [None]:
from sklearn.metrics import roc_auc_score, classification_report
# ROC AUC measures how well the classifier *ranks* samples, so it needs a
# continuous score; hard 0/1 labels collapse the curve to a single point.
# Use the predicted probability of the positive class instead.
# Column 1 of predict_proba corresponds to clf.classes_[1]; this assumes
# Class is coded 0/1 so that column 1 is the positive class.
roc_auc_score(test['Class'], clf.predict_proba(test2)[:, 1])

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped
# the reported precision and recall of each class were exchanged and the
# support column counted predictions instead of true labels.
print(classification_report(test['Class'], pred))

In [None]:
#Best Model Catboost Classifier