# PACKAGES AND LIBRARIES

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import filterwarnings
from mpl_toolkits.mplot3d import Axes3D
import statsmodels.api as sm
import missingno as msno
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.neighbors import LocalOutlierFactor
from scipy.stats import levene
from scipy.stats import shapiro
from scipy.stats.stats import pearsonr
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import scale
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, roc_curve

In [None]:
filterwarnings("ignore", category=DeprecationWarning) 
filterwarnings("ignore", category=FutureWarning) 
filterwarnings("ignore", category=UserWarning) 

# DATA SOURCE

In [None]:
# Read the raw dataset; the path is relative to the notebook's working directory
Diabetes = pd.read_csv("../input/diabetes-data-set/diabetes.csv") # main

In [None]:
# All cleaning below is done on this copy so the frame loaded above stays untouched
data = Diabetes.copy() # copy to protect master data

In [None]:
# Numeric-only subset of `data`, taken here before the zero -> NaN replacement further down
df = data.select_dtypes(include=["float64","int64","int32"]) # for mathematical analysis, to guarantee even though it is not necessary

## Exploratory Data Analysis

In [None]:
print(data.head(10))

In [None]:
print(data.shape)

In [None]:
print(data.columns)

In [None]:
print(data.info())

In [None]:
print(data.describe().T)

In [None]:
print(df.corr())

In [None]:
print(data.groupby(["Outcome"])["Pregnancies"].mean())

In [None]:
print(data.groupby(["Outcome"])["Glucose"].mean())

In [None]:
print(data.groupby(["Outcome"])["BloodPressure"].mean())

In [None]:
print(data.groupby(["Outcome"])["SkinThickness"].mean())

In [None]:
print(data.groupby(["Outcome"])["Insulin"].mean())

In [None]:
print(data.groupby(["Outcome"])["BMI"].mean())

In [None]:
print(data.groupby(["Outcome"])["DiabetesPedigreeFunction"].mean())

In [None]:
print(data.groupby(["Outcome"])["Age"].mean())

The measured variables should not contain 0.
In this dataset, a 0 stands for a missing value (NaN).

In [None]:
print(data.isna().sum())  # number of NaN entries per column

There appear to be no missing values, but that is misleading: the missing entries are stored as 0 rather than NaN.

In [None]:
print(data.where(data["Pregnancies"] < 0.1).value_counts().sum())

In [None]:
print(data.where(data["Glucose"] < 0.1).value_counts().sum())

In [None]:
print(data.where(data["BloodPressure"] < 0.1).value_counts().sum())

In [None]:
print(data.where(data["SkinThickness"] < 0.1).value_counts().sum())

In [None]:
print(data.where(data["Insulin"] < 0.1).value_counts().sum())

In [None]:
print(data.where(data["BMI"] < 0.1).value_counts().sum())

In [None]:
print(data.where(data["DiabetesPedigreeFunction"] < 0.1).value_counts().sum()) # it may not be included in the process as it is proportional

In [None]:
print(data.where(data["Age"] < 0.1).value_counts().sum())

We must define 0 values as NaN

In [None]:
# Columns in which a recorded 0 is treated as "not measured"
# NOTE(review): 0 looks like a plausible real value for Pregnancies — confirm it should be treated as missing
zeroAsMissing = ["Pregnancies","Glucose","BloodPressure",
                 "SkinThickness","Insulin","BMI"]
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
data[zeroAsMissing] = data[zeroAsMissing].replace(0,np.nan)

In [None]:
print(data.isna().sum()) # re-check the missing values now that the zeros are NaN

Let's take a look at the correlation between missing data by visualizing it

In [None]:
msno.matrix(data,figsize=(10,8))
plt.show()

In [None]:
msno.heatmap(data,figsize=(10,8))
plt.show()

In [None]:
msno.bar(data,figsize=(10,8))
plt.show()

Let's replace each NaN with the median of the Outcome class that the row belongs to.

In [None]:
print(data.groupby(["Outcome"])["Pregnancies"].median())
data["Pregnancies"].fillna(data.groupby(["Outcome"])["Pregnancies"].transform("median"),inplace=True)

In [None]:
print(data.groupby(["Outcome"])["Glucose"].median())
data["Glucose"].fillna(data.groupby(["Outcome"])["Glucose"].transform("median"),inplace=True)

In [None]:
print(data.groupby(["Outcome"])["BloodPressure"].median())
data["BloodPressure"].fillna(data.groupby(["Outcome"])["BloodPressure"].transform("median"),inplace=True)

In [None]:
print(data.groupby(["Outcome"])["SkinThickness"].median())
data["SkinThickness"].fillna(data.groupby(["Outcome"])["SkinThickness"].transform("median"),inplace=True)

In [None]:
print(data.groupby(["Outcome"])["Insulin"].median())
data["Insulin"].fillna(data.groupby(["Outcome"])["Insulin"].transform("median"),inplace=True)

In [None]:
print(data.groupby(["Outcome"])["BMI"].median())
data["BMI"].fillna(data.groupby(["Outcome"])["BMI"].transform("median"),inplace=True)

In [None]:
print(data.isnull().sum()) # checking again missing values

In [None]:
print(data.head(10)) # checking again data

# OUTLIER VALUES

In [None]:
DataForA = data.copy() # protect master copy data

In [None]:
# Fit a Local Outlier Factor model with default settings on every column (Outcome included);
# the cell output is the array returned by fit_predict
clf = LocalOutlierFactor()
clf.fit_predict(DataForA)

In [None]:
# One negative_outlier_factor_ score per row, sorted ascending
score = clf.negative_outlier_factor_
sortedscore = np.sort(score)
print(sortedscore) # checking outlier, look where the biggest jump took place
# index 5 was picked as the position of that jump for this example
# NOTE(review): the index is hard-coded — re-check it whenever the data or the preprocessing changes

In [None]:
# Take the score at sorted position 5 as the threshold and show the row(s) carrying exactly that score
point = sortedscore[5]
print(point)
print(DataForA[score == point])

In [None]:
# Rows whose LOF score lies below the threshold are the outlier candidates.
# The mask has to be built from the scores: the previous `DataForA < point`
# compared the measurements themselves with an LOF score, which says nothing
# about which rows are outliers.
outliers = score < point
print(DataForA[outliers].value_counts().sum())
# only the few rows scoring below the chosen threshold are flagged; they are left in the data

# CORRELATION - NORMALITY - HOMOGENEITY

## Correlation

In [None]:
corrPearson = data.corr(method="pearson")
corrSpearman = data.corr(method="spearman")
# correlation types

In [None]:
figure = plt.figure(figsize=(10,8))
sns.heatmap(corrPearson,cmap="RdYlGn",vmax=1,vmin=-1,center=0,annot=True)
plt.show()

In [None]:
figure = plt.figure(figsize=(10,8))
sns.heatmap(corrSpearman,cmap="RdYlGn",vmax=1,vmin=-1,center=0,annot=True)
plt.show()

## Normality

In [None]:
# Shapiro-Wilk test for each of the first eight columns: prints "statistic - p-value"
separator = "---"*30
for column in data.columns[:8]:
    print(separator)
    print(column)
    statistic, pValue = shapiro(data[column])
    print("%.3f - %.3f" % (statistic, pValue))

## Homogeneity

In [None]:
# Levene test across the eight feature columns: prints "statistic - p-value"
featureColumns = ["Pregnancies","Glucose","BloodPressure","SkinThickness",
                  "Insulin","BMI","DiabetesPedigreeFunction","Age"]
statistic, pValue = levene(*[data[column] for column in featureColumns])
print("%.3f - %.3f" % (statistic, pValue))

# VISUALIZATION

### BASIC HIST

In [None]:
# One histogram per column for the first eight columns
data[data.columns[:8]].hist(figsize=(20,20))
plt.show()

 ### BOX

In [None]:
# Plotting copy of the data with Outcome stored as a categorical column
dataV = data.assign(Outcome=pd.Categorical(data["Outcome"]))

In [None]:
figure = plt.figure(figsize=(20,8))
sns.boxplot(x="Pregnancies",y="Outcome",data=dataV)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.boxplot(x="Glucose",y="Outcome",data=dataV)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.boxplot(x="Pregnancies",y="Outcome",data=dataV)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.boxplot(x="BloodPressure",y="Outcome",data=dataV)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.boxplot(x="SkinThickness",y="Outcome",data=dataV)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.boxplot(x="Insulin",y="Outcome",data=dataV)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.boxplot(x="BMI",y="Outcome",data=dataV)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.boxplot(x="DiabetesPedigreeFunction",y="Outcome",data=dataV)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.boxplot(x="Age",y="Outcome",data=dataV)
plt.show()

### LINE

In [None]:
figure = plt.figure(figsize=(20,8))
sns.lineplot(x="Pregnancies",y="Glucose",hue="Outcome",data=dataV)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.lineplot(x="BloodPressure",y="SkinThickness",hue="Outcome",data=dataV)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.lineplot(x="Insulin",y="BMI",hue="Outcome",data=dataV)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.lineplot(x="DiabetesPedigreeFunction",y="Age",hue="Outcome",data=dataV)
plt.show()

### HISTPLOT

In [None]:
figure = plt.figure(figsize=(20,5))
sns.histplot(
    data,
    x="Pregnancies", hue="Outcome",
    multiple="stack",
    palette="light:m_r",
    edgecolor=".3",
    linewidth=.5,
    log_scale=True,
)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,5))
sns.histplot(
    data,
    x="Glucose", hue="Outcome",
    multiple="stack",
    palette="light:m_r",
    edgecolor=".3",
    linewidth=.5,
    log_scale=True,
)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,5))
sns.histplot(
    data,
    x="BloodPressure", hue="Outcome",
    multiple="stack",
    palette="light:m_r",
    edgecolor=".3",
    linewidth=.5,
    log_scale=True,
)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,5))
sns.histplot(
    data,
    x="SkinThickness", hue="Outcome",
    multiple="stack",
    palette="light:m_r",
    edgecolor=".3",
    linewidth=.5,
    log_scale=True,
)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,5))
sns.histplot(
    data,
    x="Insulin", hue="Outcome",
    multiple="stack",
    palette="light:m_r",
    edgecolor=".3",
    linewidth=.5,
    log_scale=True,
)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,5))
sns.histplot(
    data,
    x="BMI", hue="Outcome",
    multiple="stack",
    palette="light:m_r",
    edgecolor=".3",
    linewidth=.5,
    log_scale=True,
)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,5))
sns.histplot(
    data,
    x="DiabetesPedigreeFunction", hue="Outcome",
    multiple="stack",
    palette="light:m_r",
    edgecolor=".3",
    linewidth=.5,
    log_scale=True,
)
plt.show()

### JOINTPLOT

In [None]:
figure = plt.figure(figsize=(20,8))
sns.jointplot(x="Pregnancies",y="DiabetesPedigreeFunction", color="#4CB391",hue="Outcome",data=data)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.jointplot(x="Glucose",y="DiabetesPedigreeFunction", color="#4CB391",hue="Outcome",data=data)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.jointplot(x="BloodPressure",y="DiabetesPedigreeFunction", color="#4CB391",hue="Outcome",data=data)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.jointplot(x="SkinThickness",y="DiabetesPedigreeFunction", color="#4CB391",hue="Outcome",data=data)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.jointplot(x="Insulin",y="DiabetesPedigreeFunction", color="#4CB391",hue="Outcome",data=data)
plt.show()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.jointplot(x="BMI",y="DiabetesPedigreeFunction", color="#4CB391",hue="Outcome",data=data)
plt.show()

### DISTPLOT

In [None]:
figure = plt.figure(figsize=(20,8))
sns.distplot(data[data['Outcome'] == 0]["Pregnancies"], color='black',label='No Diabetes') 
sns.distplot(data[data['Outcome'] == 1]["Pregnancies"], color='red',label='Yes Diabetes') #Red for heart disease
plt.title('DIABETES', fontsize=10)
plt.legend()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.distplot(data[data['Outcome'] == 0]["Glucose"], color='black',label='No Diabetes') 
sns.distplot(data[data['Outcome'] == 1]["Glucose"], color='red',label='Yes Diabetes') #Red for heart disease
plt.title('DIABETES', fontsize=10)
plt.legend()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.distplot(data[data['Outcome'] == 0]["BloodPressure"], color='black',label='No Diabetes') 
sns.distplot(data[data['Outcome'] == 1]["BloodPressure"], color='red',label='Yes Diabetes') #Red for heart disease
plt.title('DIABETES', fontsize=10)
plt.legend()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.distplot(data[data['Outcome'] == 0]["SkinThickness"], color='black',label='No Diabetes') 
sns.distplot(data[data['Outcome'] == 1]["SkinThickness"], color='red',label='Yes Diabetes') #Red for heart disease
plt.title('DIABETES', fontsize=10)
plt.legend()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.distplot(data[data['Outcome'] == 0]["Insulin"], color='black',label='No Diabetes') 
sns.distplot(data[data['Outcome'] == 1]["Insulin"], color='red',label='Yes Diabetes') #Red for heart disease
plt.title('DIABETES', fontsize=10)
plt.legend()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.distplot(data[data['Outcome'] == 0]["BMI"], color='black',label='No Diabetes') 
sns.distplot(data[data['Outcome'] == 1]["BMI"], color='red',label='Yes Diabetes') #Red for heart disease
plt.title('DIABETES', fontsize=10)
plt.legend()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.distplot(data[data['Outcome'] == 0]["DiabetesPedigreeFunction"], color='black',label='No Diabetes') 
sns.distplot(data[data['Outcome'] == 1]["DiabetesPedigreeFunction"], color='red',label='Yes Diabetes') #Red for heart disease
plt.title('DIABETES', fontsize=10)
plt.legend()

In [None]:
figure = plt.figure(figsize=(20,8))
sns.distplot(data[data['Outcome'] == 0]["Age"], color='black',label='No Diabetes') 
sns.distplot(data[data['Outcome'] == 1]["Age"], color='red',label='Yes Diabetes') #Red for heart disease
plt.title('DIABETES', fontsize=10)
plt.legend()

### JOINTGRID

In [None]:
figure = plt.figure(figsize=(20,8))
g = sns.JointGrid(data=data, x="Pregnancies", y="Outcome", space=0)
g.plot_joint(sns.kdeplot,cmap="rocket")
plt.show()

In [None]:
figure = plt.figure(figsize=(20,8))
g = sns.JointGrid(data=data, x="Glucose", y="Outcome", space=0)
g.plot_joint(sns.kdeplot,cmap="rocket")
plt.show()

In [None]:
figure = plt.figure(figsize=(20,8))
g = sns.JointGrid(data=data, x="Insulin", y="Outcome", space=0)
g.plot_joint(sns.kdeplot,cmap="rocket")
plt.show()

### 3D CHART

In [None]:
fig = plt.figure(figsize=(20,10))
ax = Axes3D(fig)
ax.scatter(data["Glucose"], data["Insulin"], data["Pregnancies"], c="red", s=20, alpha=0.2)
plt.show()

In [None]:
fig = plt.figure(figsize=(20,10))
ax = Axes3D(fig)
ax.scatter(data["BMI"], data["SkinThickness"], data["Pregnancies"] ,c="green", s=20, alpha=0.2)
plt.show()

# MODEL PROCESS FOR CLASS CLASSIFICATION

### Dependent and Independent Variables ( X & Y )

In [None]:
# Target is Outcome; every other column is a feature
y = data["Outcome"]
x = data.drop(columns="Outcome")

# 80 / 20 train-test split with a fixed seed
xTrain,xTest,yTrain,yTest = train_test_split(x,y,random_state=42,test_size=0.2)

### Classification Models

In [None]:
lj = LogisticRegression(solver="liblinear").fit(xTrain,yTrain)
gnb = GaussianNB().fit(xTrain,yTrain)
knnc = KNeighborsClassifier().fit(xTrain,yTrain)
cartc = DecisionTreeClassifier(random_state=42).fit(xTrain,yTrain)
rfc = RandomForestClassifier(random_state=42,verbose=False).fit(xTrain,yTrain)
gbmc = GradientBoostingClassifier(verbose=False).fit(xTrain,yTrain)
xgbc = XGBClassifier().fit(xTrain,yTrain)
lgbmc = LGBMClassifier().fit(xTrain,yTrain)
catbc = CatBoostClassifier(verbose=False).fit(xTrain,yTrain)

In [None]:
modelsc = [lj,gnb,knnc,cartc,rfc,gbmc,xgbc,lgbmc,catbc]
# models for comparison

In [None]:
# comparison
# cross_val_score refits clones of the estimator, so running it on the test split ignored
# the models fitted above and never used the training data. It now runs on the training
# split; the held-out test split is scored once with the already-fitted model.
for model in modelsc:
    name = model.__class__.__name__
    predict = model.predict(xTest)
    # default scoring for a classifier is accuracy (not R2)
    R2CV = cross_val_score(model,xTrain,yTrain,cv=10,verbose=False).mean()
    error = -cross_val_score(model,xTrain,yTrain,cv=10,scoring="neg_mean_squared_error",verbose=False).mean()
    # ROC AUC needs the positive-class probability; roc_curve on hard 0/1 predictions
    # only yields a single operating point
    roc = roc_auc_score(yTest,model.predict_proba(xTest)[:,1])
    print(name + ": ")
    print("-" * 10)
    print("ACC-->",accuracy_score(yTest,predict))
    print("CV ACC-->",R2CV)
    # the square root of the mean squared error is printed, i.e. RMSE
    print("CV RMSE-->",np.sqrt(error))
    print("ROC AUC-->",roc)
    print("-" * 30)

Best is RandomForestClassifier --> 0.9029166666666667 (about 90% mean cross-validated accuracy)

In [None]:
# Accuracy visualization

# DataFrame.append was removed in pandas 2.0: collect one record per model and build the frame once.
# Cross-validation runs on the training split (cross_val_score refits clones of the estimator).
records = []
for model in modelsc:
    name = model.__class__.__name__
    R2CV = cross_val_score(model,xTrain,yTrain,cv=10,verbose=False).mean()
    records.append({"MODELS": name,"R2CV": R2CV*100})
r = pd.DataFrame(records,columns=["MODELS","R2CV"])

figure = plt.figure(figsize=(20,8))
sns.barplot(x="R2CV",y="MODELS",data=r,color="k")
plt.xlabel("R2CV")
plt.ylabel("MODELS")
plt.xlim(0,100)
plt.title("MODEL ACCURACY COMPARISON")
plt.show()

In [None]:
# Error visualization

# DataFrame.append was removed in pandas 2.0: collect one record per model and build the frame once.
# Cross-validation runs on the training split (cross_val_score refits clones of the estimator).
records = []
for model in modelsc:
    name = model.__class__.__name__
    error = -cross_val_score(model,xTrain,yTrain,cv=10,scoring="neg_mean_squared_error").mean()
    # the plotted value is the square root of the mean squared error (RMSE)
    records.append({"MODELS": name,"error": np.sqrt(error)})
r = pd.DataFrame(records,columns=["MODELS","error"])

figure = plt.figure(figsize=(20,8))
sns.barplot(x="error",y="MODELS",data=r,color="r")
plt.xlabel("ERROR")
plt.ylabel("MODELS")
plt.xlim(0,2)
plt.title("MODEL ERROR COMPARISON")
plt.show()

### NEURAL NETWORK (MLP) - ADDITIONAL

In [None]:
# Standardise the features with statistics learned from the training split only
scaler = StandardScaler()
scaler.fit(xTrain,yTrain)
xRTrain, xRTest = scaler.transform(xTrain), scaler.transform(xTest)

In [None]:
# Seeded so the network's random initialisation is repeatable between runs
mlpc = MLPClassifier(random_state=42).fit(xRTrain,yTrain)
predict = mlpc.predict(xRTest)

# cross_val_score refits clones of the estimator, so it is run on the scaled training
# split instead of the test split; default classifier scoring is accuracy
R2CV = cross_val_score(mlpc,xRTrain,yTrain,cv=10).mean()
print(R2CV)
error = -cross_val_score(mlpc,xRTrain,yTrain,cv=10,scoring="neg_mean_squared_error").mean()
print(np.sqrt(error))
# held-out accuracy of the model fitted above (`predict` was previously computed but never used)
print(accuracy_score(yTest,predict))

### LOGIT - ADDITIONAL

In [None]:
# statsmodels does not add an intercept by itself; without the constant column the
# model is fitted with no intercept term
loj = sm.Logit(yTrain,sm.add_constant(xTrain)).fit()
print(loj.summary())

This additional model scores 0.8387500000000001, so Random Forest is still the best --> 0.9029166666666667 (about 90% accuracy)

# BEST MODEL RANDOM FOREST PROCESS

### Random Forest Predicted Probabilities

In [None]:
# Class probabilities for every row of the test split
predictprob = rfc.predict_proba(X=xTest)
print(predictprob)
# first column  - class 0
# second column - class 1

### Random Forest Importance

In [None]:
# Feature importances of the fitted forest, expressed in percent
Importance = pd.DataFrame(rfc.feature_importances_ * 100,
                          index=xTrain.columns,
                          columns=["Importance"])

# Horizontal bars, least important feature at the bottom
Importance.sort_values("Importance").plot.barh(color="r")
plt.show()

### Random Forest Tuning

In [None]:
# Hyper-parameter grid searched for the random forest
# you can change parameters
params = dict(
    max_depth=[2, 5, 8, 10],
    max_features=[2, 5, 8],
    n_estimators=[10, 500, 1000],
    min_samples_split=[2, 5, 10],
)

In [None]:
# Exhaustive 10-fold grid search over `params`, using all available cores
gridSearch = GridSearchCV(estimator=rfc,param_grid=params,cv=10,n_jobs=-1,verbose=False)
cv = gridSearch.fit(xTrain,yTrain)
print(cv.best_params_)
print(cv.best_score_)

### Final Random Forest Model

In [None]:
# Build the final forest from the parameters the grid search actually selected
# (instead of hand-copied values that can drift from the search result) and seed it
# so the result is repeatable
rfctuned = RandomForestClassifier(random_state=42,**cv.best_params_).fit(xTrain,yTrain)
predicttuned = rfctuned.predict(xTest)

# cross_val_score refits clones of the estimator, so it is run on the training split;
# default classifier scoring is accuracy
R2CVtuned = cross_val_score(rfctuned,xTrain,yTrain,cv=10).mean()
print(R2CVtuned)
errortuned = -cross_val_score(rfctuned,xTrain,yTrain,cv=10,scoring="neg_mean_squared_error").mean()
print(np.sqrt(errortuned))
# held-out accuracy of the tuned model (`predicttuned` was previously computed but never used)
print(accuracy_score(yTest,predicttuned))

The best model is still the RandomForestClassifier with its basic (default) parameters --> 0.9029166666666667 (about 90% accuracy)

# PREDICTION FOR NUMERICAL VALUES 

* Pregnancies: 
Number of times pregnant

* Glucose: 
Plasma glucose concentration at 2 hours in an oral glucose tolerance test

* BloodPressure: 
Diastolic blood pressure (mm Hg)

* SkinThickness: 
Triceps skin fold thickness (mm)

* Insulin: 
2-Hour serum insulin (mu U/ml)

* BMI: 
Body mass index (weight in kg/(height in m)^2)

* DiabetesPedigreeFunction: 
Diabetes pedigree function

* Age: 
Age (years)

* Outcome: 
Class variable (0 or 1) --> we will not use this variable in this section

### Dependent and Independent Variables ( X & Y )

In [None]:
# we will predict Glucose tolerance for example
# you can pick other parameters
yN = data["Glucose"]
xN = data.drop(columns=["Outcome","Glucose"])


# 80 / 20 train-test split with a fixed seed
xnTrain,xnTest,ynTrain,ynTest = train_test_split(xN,yN,random_state=42,test_size=0.2)

### REGRESSION MODELS

In [None]:
lm = LinearRegression().fit(xnTrain,ynTrain)
pls = PLSRegression().fit(xnTrain,ynTrain)
ridge = Ridge().fit(xnTrain,ynTrain)
lasso = Lasso().fit(xnTrain,ynTrain)
elasticnet = ElasticNet().fit(xnTrain,ynTrain)
knnr = KNeighborsRegressor().fit(xnTrain,ynTrain)
cartr = DecisionTreeRegressor(random_state=42).fit(xnTrain,ynTrain)
baggr = BaggingRegressor(random_state=42,bootstrap_features=True,verbose=False).fit(xnTrain,ynTrain)
rfr = RandomForestRegressor(random_state=42,verbose=False).fit(xnTrain,ynTrain)
gbmr = GradientBoostingRegressor(verbose=False).fit(xnTrain,ynTrain)
xgbr = XGBRegressor().fit(xnTrain,ynTrain)
lgbmr = LGBMRegressor().fit(xnTrain,ynTrain)
catbr = CatBoostRegressor(verbose=False).fit(xnTrain,ynTrain)

In [None]:
models = [lm,pls,ridge,lasso,elasticnet,knnr,
         cartr,baggr,rfr,gbmr,xgbr,lgbmr,catbr]
# models for comparison

In [None]:
# cross_val_score refits clones of the estimator, so running it on the test split ignored
# the models fitted above and never used the training data. It now runs on the training
# split; the held-out test split is scored once with the already-fitted model.
for model in models:
    name = model.__class__.__name__
    R2CV = cross_val_score(model,xnTrain,ynTrain,cv=10,scoring="r2").mean()
    error = -cross_val_score(model,xnTrain,ynTrain,cv=10,scoring="neg_mean_squared_error").mean()
    predictN = model.predict(xnTest)
    print(name + ": ")
    print("-" * 10)
    print("CV R2-->",R2CV)
    print("CV RMSE-->",np.sqrt(error))
    print("TEST R2-->",r2_score(ynTest,predictN))
    print("TEST RMSE-->",np.sqrt(mean_squared_error(ynTest,predictN)))
    print("-" * 30)

### PCR - ADDITIONAL

In [None]:
pca = PCA()
xnRTrain = pca.fit_transform(scale(xnTrain))
xnRTest = pca.fit_transform(scale(xnTest))

In [None]:
lmP = LinearRegression().fit(xnRTrain,ynTrain)
R2CV = cross_val_score(lmP,xnRTest,ynTest,cv=10,scoring="r2").mean()
error = -cross_val_score(lmP,xnRTest,ynTest,cv=10,scoring="neg_mean_squared_error").mean()

print(R2CV)
print(np.sqrt(error))

### NEURAL NETWORK (MLP) - ADDITIONAL

In [None]:
# Standardise the regression features with statistics learned from the training split only
scaler = StandardScaler()
scaler.fit(xnTrain,ynTrain)
xnRTrain, xnRTest = scaler.transform(xnTrain), scaler.transform(xnTest)

In [None]:
# Train on the standardised features prepared above: the network was previously fitted
# on the raw xnTrain while being scored on scaled data. Seeded for repeatability.
mlpr = MLPRegressor(random_state=42).fit(xnRTrain,ynTrain)

# cross_val_score refits clones of the estimator, so it is run on the scaled training split
R2CV = cross_val_score(mlpr,xnRTrain,ynTrain,cv=10,scoring="r2").mean()
error = -cross_val_score(mlpr,xnRTrain,ynTrain,cv=10,scoring="neg_mean_squared_error").mean()

print(R2CV)
print(np.sqrt(error))
# held-out R2 of the model fitted above
print(r2_score(ynTest,mlpr.predict(xnRTest)))

### OLS - ADDITIONAL

In [None]:
# statsmodels does not add an intercept by itself; without the constant column the
# summary reports the uncentered R-squared, which is not comparable with the
# R2 values of the other models
ols = sm.OLS(ynTrain,sm.add_constant(xnTrain)).fit()
print(ols.summary())

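Best is OLS --> 0.956 (R2 of about 95%). Note that this figure appears to be the R2 reported by the OLS summary on the training data, so it is not directly comparable with the cross-validated scores of the other models.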

> Always use classifiers for classification problems with a binary (0/1) target.