In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import filterwarnings
from mpl_toolkits.mplot3d import Axes3D
import statsmodels.api as sm
import missingno as msno
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.neighbors import LocalOutlierFactor
from scipy.stats import levene
from scipy.stats import shapiro
from scipy.stats.stats import pearsonr
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import scale
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, roc_curve

* Group --> Class
* Age --> Age
* EDUC --> Years of Education
* SES --> Socioeconomic Status / 1-5
* MMSE --> Mini Mental State Examination
* CDR --> Clinical Dementia Rating
* eTIV --> Estimated total intracranial volume
* nWBV --> Normalized Whole Brain Volume
* ASF --> Atlas Scaling Factor

# DATA

In [None]:
# Read the raw table once. `AlzheimerData` is left untouched so that later
# cells can copy the original columns back from it.
AlzheimerData = pd.read_csv("../input/alzheimer-features/alzheimer.csv")

# Working copy for the visualisation cells: the discrete columns are recast as
# categoricals, one after another.
data = AlzheimerData.copy()
for categorical_column in ("Group", "M/F", "SES", "CDR", "EDUC", "Age"):
    data[categorical_column] = pd.Categorical(data[categorical_column])

# Numeric-only view: whatever columns are still float64 / int64 / int32.
df = data.select_dtypes(include=["float64", "int64", "int32"])

# INFORMATION

In [None]:
# Text overview of the data set. Every report is followed by the same rule line.
rule = "-." * 40

print(data.shape)
print(rule)
print(data.columns)
print(rule)
# info() writes its own report and returns None, so "None" is printed after it.
print(data.info())
print(rule)
print(data.describe().T)
print(rule)

# Mean of each continuous measurement per (Group, SES) and per (Group, CDR).
for grouping in ("SES", "CDR"):
    for measure in ("MMSE", "eTIV", "nWBV", "ASF"):
        print(data.groupby(["Group", grouping])[measure].mean())
        print(rule)

# Frequency tables of selected discrete columns.
for counted in ("Group", "EDUC", "M/F"):
    print(data[counted].value_counts())
    print(rule)

# Correlations of the numeric view, then the missing-value count per column.
print(df.corr())
print(rule)
print(data.isnull().sum())
print(rule)

# MISSING VALUES VISUALIZATION

In [None]:
# Draw missingno's heatmap first and its matrix second, then show the figures.
for draw_missing in (msno.heatmap, msno.matrix):
    draw_missing(data)
plt.show()

# VISUALIZATION

In [None]:
# Histogram of the diagnosis label column.
data.loc[:, "Group"].hist(figsize=(5, 5))
plt.show()

In [None]:
# Histogram of the M/F column.
data.loc[:, "M/F"].hist(figsize=(5, 5))
plt.show()

In [None]:
# Histogram of the socioeconomic status column.
data.loc[:, "SES"].hist(figsize=(5, 5))
plt.show()

In [None]:
# Histogram of the CDR column.
data.loc[:, "CDR"].hist(figsize=(5, 5))
plt.show()

In [None]:
# Histogram of the Age column.
data.loc[:, "Age"].hist(figsize=(5, 5))
plt.show()

In [None]:
# 3-D scatter of MMSE against eTIV and nWBV.
fig = plt.figure()
# Create the 3-D axes through the figure. Instantiating Axes3D(fig) directly no
# longer attaches the axes to the figure on recent Matplotlib releases, which
# leaves the rendered plot empty.
ax = fig.add_subplot(projection="3d")
ax.scatter(data["MMSE"], data["eTIV"], data["nWBV"], c="green", s=20, alpha=0.5)
# Label the axes so the figure can be read on its own.
ax.set_xlabel("MMSE")
ax.set_ylabel("eTIV")
ax.set_zlabel("nWBV")
plt.show()

In [None]:
# MMSE against SES, one colour per Group.
sns.scatterplot(data=data, x="SES", y="MMSE", hue="Group")
plt.show()

In [None]:
# eTIV against SES, one colour per Group.
sns.scatterplot(data=data, x="SES", y="eTIV", hue="Group")
plt.show()

In [None]:
# nWBV against SES, one colour per Group.
sns.scatterplot(data=data, x="SES", y="nWBV", hue="Group")
plt.show()

In [None]:
# ASF against SES, one colour per Group.
sns.scatterplot(data=data, x="SES", y="ASF", hue="Group")
plt.show()

In [None]:
# Line plot of MMSE over SES, one line per Group.
sns.lineplot(data=data, x="SES", y="MMSE", hue="Group")
plt.show()

In [None]:
# Line plot of eTIV over SES, one line per Group.
sns.lineplot(data=data, x="SES", y="eTIV", hue="Group")
plt.show()

In [None]:
# Line plot of nWBV over SES, one line per Group.
sns.lineplot(data=data, x="SES", y="nWBV", hue="Group")
plt.show()

In [None]:
# Line plot of ASF over SES, one line per Group.
sns.lineplot(data=data, x="SES", y="ASF", hue="Group")
plt.show()

# CORRELATION VISUALIZATION

In [None]:
# Columns used by the correlation and normality cells below.
Features = [
    "MMSE",
    "eTIV",
    "nWBV",
    "ASF",
]

In [None]:
# Pairwise correlation matrices of the selected features, Pearson then Spearman.
corrPearson, corrSpearman = (
    data[Features].corr(method=corr_method)
    for corr_method in ("pearson", "spearman")
)

In [None]:
# Annotated heatmap of the Pearson matrix on a fixed [-1, 1] colour scale.
fig = plt.figure(figsize=(10, 8))
sns.heatmap(corrPearson, vmin=-1, vmax=1, cmap="RdYlGn", annot=True)
plt.title("Pearson Correlation")
plt.show()

In [None]:
# Annotated heatmap of the Spearman matrix on a fixed [-1, 1] colour scale.
fig = plt.figure(figsize=(10, 8))
sns.heatmap(corrSpearman, vmin=-1, vmax=1, cmap="RdYlGn", annot=True)
plt.title("Spearman Correlation")
plt.show()

# NORMALITY

In [None]:
# Shapiro-Wilk normality test for every column listed in `Features`.
for i in Features:
    print(i,"-----------")
    # Missing values are only imputed further down the notebook. Drop them here
    # so that a single NaN does not turn the statistic and p-value into NaN.
    print(shapiro(data[i].dropna()))

# HOMOGENEITY

In [None]:
# Levene test for equal variances across the four continuous measurements.
# NaNs are removed per column first, because the columns are not imputed until
# later in the notebook and a NaN sample would make the result NaN.
print(levene(*(data[column].dropna() for column in ("MMSE","eTIV","nWBV","ASF"))))

# CONVERSION

In [None]:
# Replace the labels of the listed columns with integer codes. The value counts
# are printed before and after each conversion so the mapping can be read off.
Columns = ["Group", "M/F"]
encode = LabelEncoder()
for i in Columns:
    counts_before = data[i].value_counts()
    print(counts_before)
    print("----")
    data[i] = encode.fit_transform(data[i])
    counts_after = data[i].value_counts()
    print(counts_after)
    print("----" * 30)

# Group
* Converted (0)
* Demented (1)
* Nondemented (2)

# M/F
* F (0)
* M (1)

In [None]:
# Copy the original (non-categorical) versions of these columns back from the
# raw frame, then print the resulting dtypes.
for restored in ("SES", "CDR", "EDUC", "Age"):
    data[restored] = AlzheimerData[restored]

print(data.info())

# OUTLIER VALUES

In [None]:
# Local Outlier Factor on the rows that have no missing value.
DataForA = data.dropna(axis=0, how="any")
clf = LocalOutlierFactor()
# Last expression of the cell: the predicted labels are displayed as its output.
clf.fit_predict(X=DataForA)

In [None]:
# Negative outlier factor of every fitted row, plus an ascending copy.
score = clf.negative_outlier_factor_
scoreSort = score.copy()
scoreSort.sort()
print(scoreSort[:50])

In [None]:
# Use the fourth-smallest score as the threshold and show the row(s) at it.
point = scoreSort[3]
at_threshold = score == point
print(DataForA.loc[at_threshold])

In [None]:
# Rows whose LOF score falls below the threshold, i.e. the flagged outliers.
# The mask must be built from `score` (one value per row of DataForA);
# comparing the feature values themselves with the threshold selects nothing
# meaningful.
againstvalues = score < point
print(DataForA[againstvalues])

In [None]:
# Rows whose LOF score lies above the threshold, i.e. the inliers.
# The mask comes from `score` and is applied to DataForA, the frame the scores
# were computed on, so mask and rows line up one-to-one.
normalvalues = score > point
print(DataForA[normalvalues])

# MISSING VALUES PROCESS

In [None]:
# Mean-impute the two columns filled by this notebook. The result is assigned
# back to the column: calling fillna(..., inplace=True) on a column selection
# is deprecated chained assignment and leaves `data` unchanged once pandas
# copy-on-write is active.
data["SES"] = data["SES"].fillna(data["SES"].mean())
data["MMSE"] = data["MMSE"].fillna(data["MMSE"].mean())
# Remaining missing values per column.
print(data.isnull().sum())

# X & Y FOR MODELS

In [None]:
# Target is the encoded Group column; every other column is a predictor.
# NOTE(review): CDR remains among the predictors - confirm that is intended.
y = data["Group"]
x = data.drop(columns="Group")

# Fixed-seed 80/20 split shared by all models below.
xTrain, xTest, yTrain, yTest = train_test_split(
    x, y, random_state=42, test_size=0.20
)


# OLS MODELS & ERROR & TUNING & PREDICT

In [None]:
# statsmodels OLS does not add an intercept on its own. Without one the summary
# reports an uncentered R2, which is not comparable with the scikit-learn
# models below (the 0.89 noted in the source came from that intercept-free fit).
# has_constant="add" always appends the column, so the train and test design
# matrices keep the same layout.
xTrainConst = sm.add_constant(xTrain, has_constant="add")
xTestConst = sm.add_constant(xTest, has_constant="add")

ols = sm.OLS(yTrain,xTrainConst).fit()
predict = ols.predict(xTestConst)
print(ols.summary())

# LINEAR MODELS & ERROR & TUNING & PREDICT

In [None]:
# Plain linear regression fitted on the training split.
lm = LinearRegression().fit(xTrain, yTrain)
predict = lm.predict(xTest)

# 10-fold CV on the hold-out split. cross_val_score refits clones per fold, so
# the estimator fitted above is not the one being scored.
r2_folds = cross_val_score(lm, xTest, yTest, scoring="r2", cv=10)
R2CV = r2_folds.mean()
print(R2CV)
# 0.13 (value noted in the source)
mse_folds = cross_val_score(lm, xTest, yTest, scoring="neg_mean_squared_error", cv=10)
errorCV = -mse_folds.mean()
print(np.sqrt(errorCV))

# PCR MODELS & ERROR & TUNING & PREDICT

In [None]:
# Principal component regression: standardise, rotate onto the principal
# components, then fit least squares on the component scores.
# StandardScaler is used instead of scale() so the statistics learned on the
# training split can be re-applied to the hold-out rows.
pcr_scaler = StandardScaler().fit(xTrain)
pca = PCA()
xRTrain = pca.fit_transform(pcr_scaler.transform(xTrain))
# The hold-out rows must pass through the same train-fitted scaling and
# rotation. Feeding raw xTest to a model trained on component scores produces
# meaningless predictions.
xRTest = pca.transform(pcr_scaler.transform(xTest))

# Dedicated name: the plain linear model `lm` fitted in the previous cell is
# used again in the comparison cells and must not be replaced by a model that
# expects component scores.
pcr = LinearRegression().fit(xRTrain,yTrain)
predict = pcr.predict(xRTest)

# 10-fold CV on the projected hold-out split. The 0.13 noted in the source was
# obtained on the unprojected features.
R2CV = cross_val_score(pcr,xRTest,yTest,cv=10,scoring="r2").mean()
print(R2CV)
errorCV = -cross_val_score(pcr,xRTest,yTest,cv=10,scoring="neg_mean_squared_error").mean()
print(np.sqrt(errorCV))

# PLS MODELS & ERROR & TUNING & PREDICT

In [None]:
# Partial least squares with the default number of components.
pls = PLSRegression().fit(xTrain,yTrain)
predict = pls.predict(xTest)

# 10-fold CV on the hold-out split (cross_val_score refits clones per fold).
R2CV = cross_val_score(pls,xTest,yTest,cv=10,scoring="r2").mean()
print(R2CV)
# 0.13 (value noted in the source)
errorCV = -cross_val_score(pls,xTest,yTest,cv=10,scoring="neg_mean_squared_error").mean()
print(np.sqrt(errorCV))

# Sweep the number of components. PLSRegression does not accept more components
# than there are predictors, so the original upper limit of 19 is capped at the
# feature count instead of failing part-way through the loop.
maxComponents = min(19, xTrain.shape[1])
for i in range(1, maxComponents + 1):
    plstuned = PLSRegression(n_components=i).fit(xTrain,yTrain)
    print(f"{i}","--"*20)
    predicttuned = plstuned.predict(xTest)
    R2CVtuned = cross_val_score(plstuned,xTest,yTest,cv=10,scoring="r2").mean()
    print(R2CVtuned)
    # BEST IS 6 -- 0.15 (noted in the source)
    errorCVtuned = -cross_val_score(plstuned,xTest,yTest,cv=10,scoring="neg_mean_squared_error").mean()
    print(np.sqrt(errorCVtuned))
    

# RIDGE MODELS & ERROR & TUNING & PREDICT

In [None]:
# Ridge regression with the default penalty.
ridge = Ridge().fit(xTrain,yTrain)
predict = ridge.predict(xTest)

# 10-fold CV on the hold-out split (cross_val_score refits clones per fold).
R2CV = cross_val_score(ridge,xTest,yTest,cv=10,scoring="r2").mean()
print(R2CV)
errorCV = -cross_val_score(ridge,xTest,yTest,cv=10,scoring="neg_mean_squared_error").mean()
print(np.sqrt(errorCV))

# Candidate penalties. A seeded generator keeps the grid, and therefore the
# selected alpha, reproducible across runs.
alpha = np.random.RandomState(42).uniform(0.1,10,50)

# `normalize=True` is no longer accepted by scikit-learn's linear models. It
# also made alpha be selected on normalised features although the final Ridge
# below is fitted without normalisation. Dropping it keeps selection and final
# fit consistent.
cv = RidgeCV(alphas=alpha,scoring="r2",cv=10).fit(xTrain,yTrain)
print(cv.alpha_)

ridgetuned = Ridge(alpha=cv.alpha_).fit(xTrain,yTrain)
R2CVtuned = cross_val_score(ridgetuned,xTest,yTest,cv=10,scoring="r2").mean()
print(R2CVtuned)
# 0.15 (value noted in the source, obtained with the earlier alpha selection)
errorCVtuned = -cross_val_score(ridgetuned,xTest,yTest,cv=10,scoring="neg_mean_squared_error").mean()
print(np.sqrt(errorCVtuned))

# LASSO MODELS & ERROR & TUNING & PREDICT

In [None]:
# Lasso regression with the default penalty.
lasso = Lasso().fit(xTrain,yTrain)
predict = lasso.predict(xTest)

# 10-fold CV on the hold-out split (cross_val_score refits clones per fold).
R2CV = cross_val_score(lasso,xTest,yTest,cv=10,scoring="r2").mean()
print(R2CV)
errorCV = -cross_val_score(lasso,xTest,yTest,cv=10,scoring="neg_mean_squared_error").mean()
print(np.sqrt(errorCV))

# Let LassoCV choose alpha from its automatic grid. The `normalize` argument
# was removed from scikit-learn's linear models, so it is dropped from both the
# search and the final estimator; the two still see the features on the same
# scale.
cv = LassoCV(alphas=None,max_iter=100000).fit(xTrain,yTrain)
print(cv.alpha_)

lassotuned = Lasso(alpha=cv.alpha_).fit(xTrain,yTrain)

R2CVtuned = cross_val_score(lassotuned,xTest,yTest,cv=10,scoring="r2").mean()
print(R2CVtuned)
# 0.14 (value noted in the source, obtained with normalize=True)
errorCVtuned = -cross_val_score(lassotuned,xTest,yTest,cv=10,scoring="neg_mean_squared_error").mean()
print(np.sqrt(errorCVtuned))

# ENET MODELS & ERROR & TUNING & PREDICT

In [None]:
# Elastic net with default penalties, fitted on the training split.
elastic = ElasticNet().fit(xTrain, yTrain)
predict = elastic.predict(xTest)

# 10-fold CV on the hold-out split (cross_val_score refits clones per fold).
r2_folds = cross_val_score(elastic, xTest, yTest, scoring="r2", cv=10)
R2CV = r2_folds.mean()
print(R2CV)
mse_folds = cross_val_score(elastic, xTest, yTest, scoring="neg_mean_squared_error", cv=10)
errorCV = -mse_folds.mean()
print(np.sqrt(errorCV))

# Alpha chosen by ElasticNetCV from its automatic grid.
cv = ElasticNetCV(alphas=None, random_state=0).fit(xTrain, yTrain)
print(cv.alpha_)

elastictuned = ElasticNet(alpha=cv.alpha_).fit(xTrain, yTrain)

r2_folds = cross_val_score(elastictuned, xTest, yTest, scoring="r2", cv=10)
R2CVtuned = r2_folds.mean()
print(R2CVtuned)
# 0.15 (value noted in the source)
mse_folds = cross_val_score(elastictuned, xTest, yTest, scoring="neg_mean_squared_error", cv=10)
errorCVtuned = -mse_folds.mean()
print(np.sqrt(errorCVtuned))

# KNN MODELS & ERROR & TUNING & PREDICT

In [None]:
# k-nearest-neighbours regression with the default k.
knn = KNeighborsRegressor().fit(xTrain,yTrain)
predict = knn.predict(xTest)

# 10-fold CV on the hold-out split (cross_val_score refits clones per fold).
R2CV = cross_val_score(knn,xTest,yTest,cv=10,scoring="r2").mean()
print(R2CV)
errorCV = -cross_val_score(knn,xTest,yTest,cv=10,scoring="neg_mean_squared_error").mean()
print(np.sqrt(errorCV))

# Grid search over k = 1..9 on the training split.
neighbor = {"n_neighbors":np.arange(1,10)}

cv = GridSearchCV(knn,neighbor,cv=10,verbose=False,n_jobs=-1).fit(xTrain,yTrain)
print(cv.best_params_)
# 7 (value noted in the source)

# Build the tuned model from the search result itself rather than a hard-coded
# copy of it, so the two cannot drift apart when the data or the grid change.
knntuned = KNeighborsRegressor(**cv.best_params_).fit(xTrain,yTrain)

R2CVtuned = cross_val_score(knntuned,xTest,yTest,cv=10,scoring="r2").mean()
print(R2CVtuned)
# -0.15 (value noted in the source)
errorCVtuned = -cross_val_score(knntuned,xTest,yTest,cv=10,scoring="neg_mean_squared_error").mean()
print(np.sqrt(errorCVtuned))

# ARTIFICIAL NEURAL NETWORKS MODELS & ERROR & TUNING & PREDICT

In [None]:
# The network is trained on standardised features, so every input passed to it
# (prediction and cross-validation alike) must go through the same train-fitted
# scaler. Feeding it raw xTest produces meaningless scores.
scaler = StandardScaler().fit(xTrain,yTrain)
xRTrain = scaler.transform(xTrain)
xRTest = scaler.transform(xTest)

mlp = MLPRegressor().fit(xRTrain,yTrain)
predict = mlp.predict(xRTest)

# 10-fold CV on the standardised hold-out split (clones are refitted per fold).
R2CV = cross_val_score(mlp,xRTest,yTest,cv=10,scoring="r2").mean()
print(R2CV)
errorCV = -cross_val_score(mlp,xRTest,yTest,cv=10,scoring="neg_mean_squared_error").mean()
print(np.sqrt(errorCV))

params = {"alpha":[0.0001,0.001,0.01,0.1,0.2],
         "hidden_layer_sizes": [(20,20),(100,200,150),(300,200,250)],
         "activation": ["relu","logistic"]}

# Grid search kept for reference (left disabled in the source); noted result:
# cv = GridSearchCV(mlp,params,cv=10,verbose=False,n_jobs=-1).fit(xRTrain,yTrain)
# print(cv.best_params_)
# {'activation': 'relu', 'alpha': 0.2, 'hidden_layer_sizes': (300, 200, 250)}

# Fitted like the other tuned models so it can be used for prediction later.
mlptuned = MLPRegressor(activation="relu",alpha=0.2,hidden_layer_sizes=(300,200,250)).fit(xRTrain,yTrain)

R2CVtuned = cross_val_score(mlptuned,xRTest,yTest,cv=10,scoring="r2").mean()
print(R2CVtuned)
# -83 was noted in the source; that run scored the network on unscaled inputs.
errorCVtuned = -cross_val_score(mlptuned,xRTest,yTest,cv=10,scoring="neg_mean_squared_error").mean()
print(np.sqrt(errorCVtuned))

# REGRESSION TREES(CART) MODELS & ERROR & TUNING & PREDICT

In [None]:
# Unconstrained regression tree fitted on the training split.
cart = DecisionTreeRegressor().fit(xTrain, yTrain)
predict = cart.predict(xTest)

# 10-fold CV on the hold-out split (cross_val_score refits clones per fold).
r2_folds = cross_val_score(cart, xTest, yTest, scoring="r2", cv=10)
R2CV = r2_folds.mean()
print(R2CV)
mse_folds = cross_val_score(cart, xTest, yTest, scoring="neg_mean_squared_error", cv=10)
errorCV = -mse_folds.mean()
print(np.sqrt(errorCV))

params = {
    "min_samples_split": range(2, 100),
    "max_leaf_nodes": range(2, 10),
}

# Grid search kept for reference (left disabled in the source); noted result:
# cv = GridSearchCV(cart,params,cv=10,verbose=False,n_jobs=-1).fit(xTrain,yTrain)
# print(cv.best_params_)
# {'max_leaf_nodes': 2, 'min_samples_split': 2}

carttuned = DecisionTreeRegressor(min_samples_split=2, max_leaf_nodes=2).fit(xTrain, yTrain)

r2_folds = cross_val_score(carttuned, xTest, yTest, scoring="r2", cv=10)
R2CVtuned = r2_folds.mean()
print(R2CVtuned)
# 0.19 (value noted in the source)
mse_folds = cross_val_score(carttuned, xTest, yTest, scoring="neg_mean_squared_error", cv=10)
errorCVtuned = -mse_folds.mean()
print(np.sqrt(errorCVtuned))


# BAGGING MODELS & ERROR & TUNING & PREDICT

In [None]:
# Bagging ensemble with bootstrapped features and a fixed seed.
bagg = BaggingRegressor(bootstrap_features=True, random_state=42).fit(xTrain, yTrain)
predict = bagg.predict(xTest)

# 10-fold CV on the hold-out split (cross_val_score refits clones per fold).
r2_folds = cross_val_score(bagg, xTest, yTest, scoring="r2", cv=10)
R2CV = r2_folds.mean()
print(R2CV)
# 0.14 (value noted in the source)
mse_folds = cross_val_score(bagg, xTest, yTest, scoring="neg_mean_squared_error", cv=10)
errorCV = -mse_folds.mean()
print(np.sqrt(errorCV))

estimators = {"n_estimators": range(2, 30)}

# Grid search kept for reference (left disabled in the source); noted result:
# cv = GridSearchCV(bagg,estimators,cv=10,verbose=False,n_jobs=-1).fit(xTrain,yTrain)
# print(cv.best_params_)
# {'n_estimators': 9}

baggtuned = BaggingRegressor(
    n_estimators=9, bootstrap_features=True, random_state=42
).fit(xTrain, yTrain)

r2_folds = cross_val_score(baggtuned, xTest, yTest, scoring="r2", cv=10)
R2CVtuned = r2_folds.mean()
print(R2CVtuned)
# 0.10 (value noted in the source)
mse_folds = cross_val_score(baggtuned, xTest, yTest, scoring="neg_mean_squared_error", cv=10)
errorCVtuned = -mse_folds.mean()
print(np.sqrt(errorCVtuned))

# RANDOM FORESTS (RF) MODELS & ERROR & TUNING & PREDICT

In [None]:
# Random forest regressor with default settings (no seed is set here).
rf = RandomForestRegressor().fit(xTrain, yTrain)
predict = rf.predict(xTest)

# 10-fold CV on the hold-out split (cross_val_score refits clones per fold).
r2_folds = cross_val_score(rf, xTest, yTest, scoring="r2", cv=10)
R2CV = r2_folds.mean()
print(R2CV)
# 0.15 (value noted in the source)
mse_folds = cross_val_score(rf, xTest, yTest, scoring="neg_mean_squared_error", cv=10)
errorCV = -mse_folds.mean()
print(np.sqrt(errorCV))

params = {
    "max_depth": range(1, 20),
    "max_features": [3, 5, 10, 15, 20],
    "n_estimators": [200, 300, 500, 1000, 2000],
}

# Grid search kept for reference (left disabled in the source); noted result:
# cv = GridSearchCV(rf,params,cv=10,verbose=False,n_jobs=-1).fit(xTrain,yTrain)
# print(cv.best_params_)
# {'max_depth': 14, 'max_features': 3, 'n_estimators': 300}

# The name `frtuned` is kept because the comparison cells refer to it.
frtuned = RandomForestRegressor(
    n_estimators=300, max_features=3, max_depth=14
).fit(xTrain, yTrain)

r2_folds = cross_val_score(frtuned, xTest, yTest, scoring="r2", cv=10)
R2CVtuned = r2_folds.mean()
print(R2CVtuned)
# 0.20 (value noted in the source)
mse_folds = cross_val_score(frtuned, xTest, yTest, scoring="neg_mean_squared_error", cv=10)
errorCVtuned = -mse_folds.mean()
print(np.sqrt(errorCVtuned))

# GRADIENT BOOSTING MACHINES (GBM) MODELS & ERROR & TUNING & PREDICT

In [None]:
# Gradient boosting regressor with default settings.
gbm = GradientBoostingRegressor().fit(xTrain, yTrain)
predict = gbm.predict(xTest)

# 10-fold CV on the hold-out split (cross_val_score refits clones per fold).
r2_folds = cross_val_score(gbm, xTest, yTest, scoring="r2", cv=10)
R2CV = r2_folds.mean()
print(R2CV)
# 0.05 (value noted in the source)
mse_folds = cross_val_score(gbm, xTest, yTest, scoring="neg_mean_squared_error", cv=10)
errorCV = -mse_folds.mean()
print(np.sqrt(errorCV))

params = {
    "learning_rate": [0.001, 0.01, 0.1, 0.2],
    "max_depth": [3, 5, 8, 10],
    "n_estimators": [200, 300, 500, 1000, 2000],
    "subsample": [1, 0.5, 0.75],
}

# Grid search kept for reference (left disabled in the source); noted result:
# cv = GridSearchCV(gbm,params,cv=10,verbose=False,n_jobs=-1).fit(xTrain,yTrain)
# print(cv.best_params_)
# {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.5}

gbmtuned = GradientBoostingRegressor(
    subsample=0.5,
    n_estimators=200,
    max_depth=3,
    learning_rate=0.01,
).fit(xTrain, yTrain)

r2_folds = cross_val_score(gbmtuned, xTest, yTest, scoring="r2", cv=10)
R2CVtuned = r2_folds.mean()
print(R2CVtuned)
# 0.23 (value noted in the source)
mse_folds = cross_val_score(gbmtuned, xTest, yTest, scoring="neg_mean_squared_error", cv=10)
errorCVtuned = -mse_folds.mean()
print(np.sqrt(errorCVtuned))

# EXTREME GRADIENT BOOSTING (XGBOOST) MODELS & ERROR & TUNING & PREDICT

In [None]:
# XGBoost regressor with default settings.
# NOTE(review): this rebinds `xgb`, which the import cell uses as the alias of
# the xgboost module. The name is kept because the comparison cells use it.
xgb = XGBRegressor().fit(xTrain, yTrain)
predict = xgb.predict(xTest)

# 10-fold CV on the hold-out split (cross_val_score refits clones per fold).
r2_folds = cross_val_score(xgb, xTest, yTest, scoring="r2", cv=10)
R2CV = r2_folds.mean()
print(R2CV)
# 0.15 (value noted in the source)
mse_folds = cross_val_score(xgb, xTest, yTest, scoring="neg_mean_squared_error", cv=10)
errorCV = -mse_folds.mean()
print(np.sqrt(errorCV))

params = {
    "colsample_bytree": [0.4, 0.5, 0.6, 0.9, 1],
    "n_estimators": [100, 200, 500, 1000],
    "max_depth": [2, 3, 4, 5, 6],
    "learning_rate": [0.1, 0.01, 0.5],
}

# Grid search kept for reference (left disabled in the source); noted result:
# cv = GridSearchCV(xgb,params,cv=10,verbose=False,n_jobs=-1).fit(xTrain,yTrain)
# print(cv.best_params_)
# {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100, 'colsample_bytree': 0.9}

xgbtuned = XGBRegressor(
    max_depth=2,
    learning_rate=0.1,
    n_estimators=100,
    colsample_bytree=0.9,
).fit(xTrain, yTrain)

r2_folds = cross_val_score(xgbtuned, xTest, yTest, scoring="r2", cv=10)
R2CVtuned = r2_folds.mean()
print(R2CVtuned)
# 0.14 (value noted in the source)
mse_folds = cross_val_score(xgbtuned, xTest, yTest, scoring="neg_mean_squared_error", cv=10)
errorCVtuned = -mse_folds.mean()
print(np.sqrt(errorCVtuned))


# LIGHT GRADIENT BOOSTING (LIGHTGBM) MODELS & ERROR & TUNING & PREDICT

In [None]:
# LightGBM regressor with default settings.
lgbm = LGBMRegressor().fit(xTrain, yTrain)
predict = lgbm.predict(xTest)

# 10-fold CV on the hold-out split (cross_val_score refits clones per fold).
r2_folds = cross_val_score(lgbm, xTest, yTest, scoring="r2", cv=10)
R2CV = r2_folds.mean()
print(R2CV)
# 0.16 (value noted in the source)
mse_folds = cross_val_score(lgbm, xTest, yTest, scoring="neg_mean_squared_error", cv=10)
errorCV = -mse_folds.mean()
print(np.sqrt(errorCV))

params = {
    "n_estimators": [100, 200, 500, 1000],
    "max_depth": [2, 3, 4, 5, 6],
    "learning_rate": [0.1, 0.01, 0.5],
}

# Grid search kept for reference (left disabled in the source); noted result:
# cv = GridSearchCV(lgbm,params,cv=10,verbose=False,n_jobs=-1).fit(xTrain,yTrain)
# print(cv.best_params_)
# {'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 200}

lgbmtuned = LGBMRegressor(
    n_estimators=200, max_depth=2, learning_rate=0.01
).fit(xTrain, yTrain)

r2_folds = cross_val_score(lgbmtuned, xTest, yTest, scoring="r2", cv=10)
R2CVtuned = r2_folds.mean()
print(R2CVtuned)
# 0.19 (value noted in the source)
mse_folds = cross_val_score(lgbmtuned, xTest, yTest, scoring="neg_mean_squared_error", cv=10)
errorCVtuned = -mse_folds.mean()
print(np.sqrt(errorCVtuned))

# CATEGORY BOOSTING (CATBOOST) MODELS & ERROR & TUNING & PREDICT

In [None]:
# CatBoost regressor with default settings and training output silenced.
catb = CatBoostRegressor(verbose=False).fit(xTrain, yTrain)
predict = catb.predict(xTest)

# 10-fold CV on the hold-out split (cross_val_score refits clones per fold).
r2_folds = cross_val_score(catb, xTest, yTest, scoring="r2", cv=10)
R2CV = r2_folds.mean()
print(R2CV)
# 0.06 (value noted in the source)
mse_folds = cross_val_score(catb, xTest, yTest, scoring="neg_mean_squared_error", cv=10)
errorCV = -mse_folds.mean()
print(np.sqrt(errorCV))

params = {
    "depth": [2, 3, 4, 5, 6, 7],
    "learning_rate": [0.1, 0.01, 0.001, 0.5],
}

# Grid search kept for reference (left disabled in the source); noted result:
# cv = GridSearchCV(catb,params,cv=10,verbose=False,n_jobs=-1).fit(xTrain,yTrain)
# print(cv.best_params_)
# {'depth': 4, 'learning_rate': 0.01}

catbtuned = CatBoostRegressor(
    learning_rate=0.01, depth=4, verbose=False
).fit(xTrain, yTrain)

r2_folds = cross_val_score(catbtuned, xTest, yTest, scoring="r2", cv=10)
R2CVtuned = r2_folds.mean()
print(R2CVtuned)
# 0.20 (value noted in the source)
mse_folds = cross_val_score(catbtuned, xTest, yTest, scoring="neg_mean_squared_error", cv=10)
errorCVtuned = -mse_folds.mean()
print(np.sqrt(errorCVtuned))

# COMPARISON REGRESSOR

In [None]:
# Hold-out R2 of every fitted regressor.
# NOTE(review): each model is scored on the raw feature columns, so every entry
# (including `lm`) must have been fitted on the raw columns as well.
models = [lm,pls,ridgetuned,elastictuned,knntuned,mlp,
          carttuned,bagg,frtuned,gbmtuned,xgb,lgbmtuned,catbtuned]


for model in models:
    name = model.__class__.__name__
    # `mlp` is the one exception: it was trained on features standardised by
    # `scaler`, so it has to be scored on inputs transformed the same way.
    inputs = scaler.transform(xTest) if model is mlp else xTest
    predict = model.predict(inputs)
    # The variable name is kept from the source; the value is an R2 score.
    accuracy = r2_score(yTest, predict)
    print("-" * 28)
    print(name + ": ")
    # Labelled as R2: this is not a classification accuracy.
    print(f"R2: {accuracy}")
    

# REGRESSOR MODELS FOR TRAIN

In [None]:
# Training-split R2 of every fitted regressor (shows how closely each one fits
# the data it was trained on).
# NOTE(review): each model is scored on the raw feature columns, so every entry
# (including `lm`) must have been fitted on the raw columns as well.
models = [lm,pls,ridgetuned,elastictuned,knntuned,mlp,
          carttuned,bagg,frtuned,gbmtuned,xgb,lgbmtuned,catbtuned]

for model in models:
    name = model.__class__.__name__
    # `mlp` is the one exception: it was trained on features standardised by
    # `scaler`, so it has to be scored on inputs transformed the same way.
    inputs = scaler.transform(xTrain) if model is mlp else xTrain
    predict = model.predict(inputs)
    # The variable name is kept from the source; the value is an R2 score.
    accuracy = r2_score(yTrain, predict)
    print("-" * 28)
    print(name + ": ")
    # Labelled as R2: this is not a classification accuracy.
    print(f"R2: {accuracy}")
    

# LOGISTIC REGRESSION MODELS & ERROR & TUNING & PREDICT

In [None]:
# Logistic regression (liblinear solver) fitted on the training split.
lj = LogisticRegression(solver="liblinear").fit(xTrain, yTrain)
predict = lj.predict(xTest)

print(accuracy_score(y_true=yTest, y_pred=predict))
# 10-fold CV on the hold-out split with the estimator's default scorer.
# `R2CV` is the name used by the source.
fold_scores = cross_val_score(lj, xTest, yTest, cv=10)
R2CV = fold_scores.mean()
print(R2CV)
# 0.82 (value noted in the source)
error = mean_squared_error(y_true=yTest, y_pred=predict)
print(np.sqrt(error))

# GAUSSIAN NAIVE BAYES MODELS & ERROR & TUNING & PREDICT

In [None]:
# Gaussian naive Bayes fitted on the training split.
gnb = GaussianNB().fit(xTrain, yTrain)
predict = gnb.predict(xTest)

print(accuracy_score(y_true=yTest, y_pred=predict))
# 10-fold CV on the hold-out split with the estimator's default scorer.
fold_scores = cross_val_score(gnb, xTest, yTest, cv=10)
R2CV = fold_scores.mean()
print(R2CV)
# 0.82 (value noted in the source)
error = mean_squared_error(y_true=yTest, y_pred=predict)
print(np.sqrt(error))

# KNN MODELS & ERROR & TUNING & PREDICT

In [None]:
# k-nearest-neighbours classifier with the default k.
kn = KNeighborsClassifier().fit(xTrain, yTrain)
predict = kn.predict(xTest)

print(accuracy_score(y_true=yTest, y_pred=predict))
# 10-fold CV on the hold-out split with the estimator's default scorer.
fold_scores = cross_val_score(kn, xTest, yTest, cv=10)
R2CV = fold_scores.mean()
print(R2CV)
# 0.43 (value noted in the source)
error = mean_squared_error(y_true=yTest, y_pred=predict)
print(np.sqrt(error))

params = {"n_neighbors": np.arange(1, 50)}

# Grid search kept for reference (left disabled in the source); noted result:
# cv = GridSearchCV(kn,params,cv=10).fit(xTrain,yTrain)
# print(cv.best_params_)
# print(cv.best_score_)
# 'n_neighbors': 1

kntuned = KNeighborsClassifier(n_neighbors=1).fit(xTrain, yTrain)
predicttuned = kntuned.predict(xTest)

print(accuracy_score(y_true=yTest, y_pred=predicttuned))
fold_scores = cross_val_score(kntuned, xTest, yTest, cv=10)
R2CVtuned = fold_scores.mean()
print(R2CVtuned)
# 0.65 (value noted in the source)
errortuned = mean_squared_error(y_true=yTest, y_pred=predicttuned)
print(np.sqrt(errortuned))

# NEURAL NETWORKS CLASSIFIER MODELS & ERROR & TUNING & PREDICT

In [None]:
# Standardise both splits with statistics learned on the training split.
scaler = StandardScaler().fit(xTrain, yTrain)
xRTrain, xRTest = scaler.transform(xTrain), scaler.transform(xTest)

# Neural-network classifier with default settings, trained on scaled features.
mlpc = MLPClassifier().fit(xRTrain, yTrain)
predict = mlpc.predict(xRTest)

print(accuracy_score(y_true=yTest, y_pred=predict))
# 10-fold CV on the scaled hold-out split with the estimator's default scorer.
fold_scores = cross_val_score(mlpc, xRTest, yTest, cv=10)
R2CV = fold_scores.mean()
print(R2CV)
# 0.79 (value noted in the source)
error = mean_squared_error(y_true=yTest, y_pred=predict)
print(np.sqrt(error))

params = {
    "alpha": [0.01, 0.02, 0.005, 0.001, 0.0001],
    "hidden_layer_sizes": [(3, 5), (5, 3), (10, 10, 10), (100, 200, 150), (100, 100, 100)],
    "solver": ["lbfgs", "adam", "sgd"],
    "activation": ["relu", "logistic"],
}

# Grid search kept for reference (left disabled in the source):
# cv = GridSearchCV(mlpc,params,cv=10,verbose=False,n_jobs=-1).fit(xRTrain,yTrain)
# print(cv.best_params_)

mlpctuned = MLPClassifier(
    hidden_layer_sizes=(100, 100, 100),
    activation="relu",
    solver="adam",
    alpha=0.1,
).fit(xRTrain, yTrain)

predicttuned = mlpctuned.predict(xRTest)

print(accuracy_score(y_true=yTest, y_pred=predicttuned))
fold_scores = cross_val_score(mlpctuned, xRTest, yTest, cv=10)
R2CVtuned = fold_scores.mean()
print(R2CVtuned)
# 0.72 (value noted in the source)
errortuned = mean_squared_error(y_true=yTest, y_pred=predicttuned)
print(np.sqrt(errortuned))


# CLASSIFICATION TREES (CART) MODELS & ERROR & TUNING & PREDICT

In [None]:
# Unconstrained classification tree fitted on the training split.
cartc = DecisionTreeClassifier().fit(xTrain, yTrain)
predict = cartc.predict(xTest)

print(accuracy_score(y_true=yTest, y_pred=predict))
# 10-fold CV on the hold-out split with the estimator's default scorer.
fold_scores = cross_val_score(cartc, xTest, yTest, cv=10)
R2CV = fold_scores.mean()
print(R2CV)
# 0.74 (value noted in the source)
error = mean_squared_error(y_true=yTest, y_pred=predict)
print(np.sqrt(error))

params = {
    "max_depth": range(1, 20),
    "min_samples_split": range(2, 50),
}

# Grid search kept for reference (left disabled in the source); noted result:
# cv = GridSearchCV(cartc,params,cv=10,verbose=False,n_jobs=-1).fit(xTrain,yTrain)
# print(cv.best_params_)
# {'max_depth': 1, 'min_samples_split': 2}

cartctuned = DecisionTreeClassifier(min_samples_split=2, max_depth=1).fit(xTrain, yTrain)
predicttuned = cartctuned.predict(xTest)
print(accuracy_score(y_true=yTest, y_pred=predicttuned))
fold_scores = cross_val_score(cartctuned, xTest, yTest, cv=10)
R2CVtuned = fold_scores.mean()
print(R2CVtuned)
# 0.83 (value noted in the source)
errortuned = mean_squared_error(y_true=yTest, y_pred=predicttuned)
print(np.sqrt(errortuned))

# RANDOM FOREST CLASSIFIER (RF) MODELS & ERROR & TUNING & PREDICT

In [None]:
# Seeded random forest classifier with otherwise default settings.
rfc = RandomForestClassifier(random_state=42).fit(xTrain, yTrain)
predict = rfc.predict(xTest)

print(accuracy_score(y_true=yTest, y_pred=predict))
# 10-fold CV on the hold-out split with the estimator's default scorer.
fold_scores = cross_val_score(rfc, xTest, yTest, cv=10)
R2CV = fold_scores.mean()
print(R2CV)
# 0.81 (value noted in the source)
error = mean_squared_error(y_true=yTest, y_pred=predict)
print(np.sqrt(error))

params = {
    "max_depth": [2, 5, 8, 10],
    "max_features": [2, 5, 8],
    "n_estimators": [10, 500, 1000],
    "min_samples_split": [2, 5, 10],
}

# Grid search kept for reference (left disabled in the source):
# cv = GridSearchCV(rfc,params,cv=10,verbose=False,n_jobs=-1).fit(xTrain,yTrain)
# print(cv.best_params_)

# The tuned forest is built without a seed, as in the source.
rfctuned = RandomForestClassifier(
    n_estimators=1000,
    min_samples_split=10,
    max_features=8,
    max_depth=10,
).fit(xTrain, yTrain)

predicttuned = rfctuned.predict(xTest)
print(accuracy_score(y_true=yTest, y_pred=predicttuned))
fold_scores = cross_val_score(rfctuned, xTest, yTest, cv=10)
R2CVtuned = fold_scores.mean()
print(R2CVtuned)
# 0.83 (value noted in the source)
errortuned = mean_squared_error(y_true=yTest, y_pred=predicttuned)
print(np.sqrt(errortuned))

# GBM CLASSIFIER MODELS & ERROR & TUNING & PREDICT

In [None]:
# Gradient boosting classifier with default settings.
gbmc = GradientBoostingClassifier().fit(xTrain, yTrain)
predict = gbmc.predict(xTest)

print(accuracy_score(y_true=yTest, y_pred=predict))
# 10-fold CV on the hold-out split with the estimator's default scorer.
fold_scores = cross_val_score(gbmc, xTest, yTest, cv=10)
R2CV = fold_scores.mean()
print(R2CV)
# 0.82 (value noted in the source)
error = mean_squared_error(y_true=yTest, y_pred=predict)
print(np.sqrt(error))

params = {
    "learning_rate": [0.001, 0.01, 0.1, 0.05],
    "max_depth": [3, 5, 10],
    "n_estimators": [100, 300, 500, 1000],
    "min_samples_split": [2, 5, 10],
}

# Grid search kept for reference (left disabled in the source):
# cv = GridSearchCV(gbmc,params,cv=10,verbose=False,n_jobs=-1).fit(xTrain,yTrain)
# print(cv.best_params_)

gbmctuned = GradientBoostingClassifier(
    n_estimators=1000,
    min_samples_split=10,
    max_depth=3,
    learning_rate=0.1,
).fit(xTrain, yTrain)

predicttuned = gbmctuned.predict(xTest)
print(accuracy_score(y_true=yTest, y_pred=predicttuned))
fold_scores = cross_val_score(gbmctuned, xTest, yTest, cv=10)
R2CVtuned = fold_scores.mean()
print(R2CVtuned)
# 0.81 (value noted in the source)
errortuned = mean_squared_error(y_true=yTest, y_pred=predicttuned)
print(np.sqrt(errortuned))

# XGBOOST CLASSIFIER MODELS & ERROR & TUNING & PREDICT

In [None]:
# XGBoost classifier with default settings. XGBoost's logging switch is
# `verbosity`; `verbose` is not one of its model parameters.
xgbc = XGBClassifier(verbosity=0).fit(xTrain,yTrain)
predict = xgbc.predict(xTest)

print(accuracy_score(yTest,predict))
# 10-fold CV on the hold-out split with the estimator's default scorer.
R2CV = cross_val_score(xgbc,xTest,yTest,cv=10).mean()
print(R2CV)
# 0.84 (value noted in the source)
error = mean_squared_error(yTest,predict)
print(np.sqrt(error))

# `min_samples_split` belongs to scikit-learn's tree ensembles, not to XGBoost.
# It is removed from the grid and from the tuned model, where it only
# multiplied the search size without influencing the fit.
params = {"n_estimators": [100, 500, 1000, 2000],
          "subsample": [0.6, 0.8, 1.0],
          "max_depth": [3, 4, 5, 6],
          "learning_rate": [0.1, 0.01, 0.02, 0.05]}

# Grid search kept for reference (left disabled in the source):
# cv = GridSearchCV(xgbc,params,cv=10,verbose=False,n_jobs=-1).fit(xTrain,yTrain)
# print(cv.best_params_)

xgbctuned = XGBClassifier(learning_rate=0.01, max_depth=6,
                              n_estimators=100, subsample=0.8).fit(xTrain, yTrain)

predicttuned = xgbctuned.predict(xTest)
print(accuracy_score(yTest,predicttuned))
R2CVtuned = cross_val_score(xgbctuned,xTest,yTest,cv=10).mean()
print(R2CVtuned)
# 0.82 (value noted in the source)
errortuned = mean_squared_error(yTest,predicttuned)
print(np.sqrt(errortuned))

# LIGHTGBM CLASSIFIER MODELS & ERROR & TUNING & PREDICT

In [None]:
# LightGBM classifier with default settings.
lgbmc = LGBMClassifier().fit(xTrain,yTrain)
predict = lgbmc.predict(xTest)

print(accuracy_score(yTest,predict))
# 10-fold CV on the hold-out split with the estimator's default scorer.
R2CV = cross_val_score(lgbmc,xTest,yTest,cv=10).mean()
print(R2CV)
# 0.77 (value noted in the source)
error = mean_squared_error(yTest,predict)
print(np.sqrt(error))

# The depth key must be spelled `max_depth`, matching the constructor argument
# used for the tuned model below. The earlier "max-depth" spelling is not that
# parameter, so the listed depths would not have been searched.
params = {"n_estimators": [100, 500, 1000, 2000],
          "subsample": [0.6, 0.8, 1.0],
          "max_depth": [3, 4, 5, 6],
          "learning_rate": [0.1, 0.01, 0.02, 0.05],
          "min_child_samples": [5, 10, 20]}

# Grid search kept for reference (left disabled in the source):
# cv = GridSearchCV(lgbmc,params,cv=10,verbose=False,n_jobs=-1).fit(xTrain,yTrain)
# print(cv.best_params_)

lgbmctuned = LGBMClassifier(learning_rate=0.01, max_depth=3, min_child_samples=20,
                           n_estimators=500, subsample=0.5).fit(xTrain, yTrain)

predicttuned = lgbmctuned.predict(xTest)
print(accuracy_score(yTest,predicttuned))
R2CVtuned = cross_val_score(lgbmctuned,xTest,yTest,cv=10).mean()
print(R2CVtuned)
# 0.81 (value noted in the source)
errortuned = mean_squared_error(yTest,predicttuned)
print(np.sqrt(errortuned))

# CATBOOST CLASSIFIER MODELS & ERROR & TUNING & PREDICT

In [None]:
# CatBoost classifier with default settings and training output silenced.
catbc = CatBoostClassifier(verbose=False).fit(xTrain, yTrain)
predict = catbc.predict(xTest)

print(accuracy_score(y_true=yTest, y_pred=predict))
# 10-fold CV on the hold-out split with the estimator's default scorer.
fold_scores = cross_val_score(catbc, xTest, yTest, cv=10)
R2CV = fold_scores.mean()
print(R2CV)
# 0.80 (value noted in the source)
error = mean_squared_error(y_true=yTest, y_pred=predict)
print(np.sqrt(error))

params = {
    "iterations": [200, 500],
    "learning_rate": [0.01, 0.05, 0.1],
    "depth": [3, 5, 8],
}

# Grid search kept for reference (left disabled in the source):
# cv = GridSearchCV(catbc,params,cv=10,verbose=False,n_jobs=-1).fit(xTrain,yTrain)
# print(cv.best_params_)

catbctuned = CatBoostClassifier(
    learning_rate=0.05, iterations=200, depth=5, verbose=False
).fit(xTrain, yTrain)
predicttuned = catbctuned.predict(xTest)
print(accuracy_score(y_true=yTest, y_pred=predicttuned))
fold_scores = cross_val_score(catbctuned, xTest, yTest, cv=10)
R2CVtuned = fold_scores.mean()
print(R2CVtuned)
# 0.83 (value noted in the source)
errortuned = mean_squared_error(y_true=yTest, y_pred=predicttuned)
print(np.sqrt(errortuned))

# COMPARISON CLASSIFIERS

In [None]:
# Hold-out accuracy of every fitted classifier, printed and drawn as a bar chart.
models = [lj,gnb,kntuned,mlpc,cartctuned,rfctuned,gbmc,xgbc,lgbmctuned,catbctuned]

# Collect one record per model and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row is
# quadratic anyway.
records = []
for model in models:
    name = model.__class__.__name__
    # `mlpc` was trained on features standardised by `scaler`; all other models
    # were trained on the raw columns.
    inputs = scaler.transform(xTest) if model is mlpc else xTest
    predict = model.predict(inputs)
    accuracy = accuracy_score(yTest, predict)
    print("-" * 28)
    print(name + ": ")
    print(f"Accuracy: {accuracy}")
    records.append({"MODELS": name, "ACC": accuracy*100})

r = pd.DataFrame(records, columns=["MODELS","ACC"])

sns.barplot(x="ACC",y="MODELS",data=r,color="r")
plt.xlabel("ACC")
plt.title("MODEL ACCURACY COMPARISON")
plt.show()