## Multiple linear regression

In [1]:
# Loading pkg and methods
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error

# Split df_train into features (every column but the last) and target (last column).
# train_test_split could be used instead when starting from a single dataframe.
train_data = df_train.values
X_train = train_data[:, :-1]
y_train = train_data[:, -1]

# Fit the scaler on the training features only and keep it, so that the very same
# mean/variance can be applied to the test features below.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
lr = LinearRegression().fit(X_train, y_train)

# Split df_test the same way to check the model on unseen data.
test_data = df_test.values
X_test = test_data[:, :-1]
y_test = test_data[:, -1]

# Reuse the training scaler: fitting a second scaler on the test set would put the
# test features on a different scale than the one the model was trained on.
X_test = scaler.transform(X_test)
y_hat = lr.predict(X_test)

# Calculating metrics of the model.
# mean_squared_error returns the (non-root) MSE by default, so the `squared` keyword
# is omitted; recent scikit-learn releases no longer accept it.
print('Mean squared error for test data = ', mean_squared_error(y_test, y_hat))
# mean_squared_log_error is the mean squared log error (no square root is taken).
print('Mean squared logarithmic error for test data = ', mean_squared_log_error(y_test, y_hat))

In [None]:
# Loading modeling methods to build our multiple linear regression model
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split

# We get the data from the dataframe df_model
data = df_model.values

# Normalizing data (column-wise standardization of every column)
dataNorm = StandardScaler().fit(data).transform(data)

# Features: all standardized columns but the last one.
# Target: the last column, taken from the raw (non-standardized) data.
X = dataNorm[:, :-1]
y = data[:, -1]

# Build the model on 25 random 70/30 splits and collect the score of each split.
# LinearRegression.score returns the coefficient of determination R^2
# (1 is a perfect prediction); we report its average over the 25 splits.
score = []

for _ in range(25):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)
    lr = LinearRegression().fit(X_train, y_train)
    score.append(lr.score(X_test, y_test))

print('Average variance score on 25 random splits = ', np.round(np.mean(score), decimals=3))

# We calculate predicted values for 1 split (the last one produced by the loop)
y_hat = lr.predict(X_test)

# Fit the straight line predicted = m * real + b.
# This must happen before the title is built, because the title reports m and b.
m, b = np.polyfit(y_test, y_hat, 1)

# Visualizing model prediction and real values to have an idea of how close we are.
# Clearly, the best scenario would be to have the line y = x
plt.rcParams['figure.figsize'] = (14, 8)

plt.plot(y_test, y_hat, 'g+')
plt.plot(y_test, m*y_test + b, 'r')
plt.xlabel('Real test values')
plt.ylabel('Predicted test values')
plt.title('Real CO2 emission vs calculated CO2 emissions of the cars => '+'Slope = '+str(np.round(m, decimals=2))+' and intercept = '
          +str(np.round(b, decimals=2)))
plt.show()

## Binary logistic regression

In [None]:
# Everything this cell needs is imported here, so the cell does not depend on
# imports that only exist inside the logRegr function or in later cells.
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc

# Normalizing X
X = StandardScaler().fit(X).transform(X)

# Setting training and test data from the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)
X_train = np.around(X_train, decimals=5)
X_test = np.around(X_test, decimals=5)

# We train the binary logistic regression model and fit
clf = LogisticRegression(solver='lbfgs', max_iter=3000).fit(X_train, y_train)

# Predicted values from X_test
y_hat = clf.predict(X_test)

# Metrics
acc = accuracy_score(y_test, y_hat)
# NOTE(review): average_precision_score is meant for continuous scores; it is fed the
# hard predictions here. If plain precision was intended, precision_score is the
# matching call - confirm before relying on `prec`.
prec = average_precision_score(y_test, y_hat, average='macro')
rec = recall_score(y_test, y_hat, average='macro')
f1scor = f1_score(y_test, y_hat, average='binary')  # for threshold in prediction of 0.5 or 50% (default)
print('Accuracy = ', acc)

# We will now calculate the ROC AUC and plot it.
# The classifier is already fitted on X_train/y_train, so its decision function is
# used directly instead of training the same model a second time.
y_score = clf.decision_function(X_test)

# Compute ROC curve, ROC area and plot it => ROC AUC should be analysed together with
# precision, recall and accuracy, and the confusion matrix checked to see whether
# every class is actually being predicted.
fpr, tpr, threshold = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
print('ROC AUC: ', roc_auc)

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='red',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC AUC for binary logistic regression ')
plt.legend(loc="lower right")
plt.show()

In [None]:
# The same workflow wrapped in a function, for reuse within a script. It also prints the confusion matrix (confusion_matrix is imported inside the function).
def logRegr(X, y, test_size, max_iter):
    """Fit a binary logistic regression on a random train/test split and report metrics.

    The features are standardized with ``StandardScaler`` (fitted on the whole of
    ``X``, before splitting), split with ``train_test_split(shuffle=True)``, rounded
    to 3 decimals and used to train ``LogisticRegression(solver='lbfgs')``.
    The ROC AUC and the confusion matrix are printed and the ROC curve is plotted.

    Relies on the module-level names ``np`` (numpy) and ``plt`` (matplotlib.pyplot).
    No random seed is set, so each call produces a different split.

    Parameters
    ----------
    X : array-like
        Feature matrix, passed to ``StandardScaler``.
    y : array-like
        Binary target labels (``f1_score`` is called with ``average='binary'``).
    test_size
        Forwarded to ``train_test_split``.
    max_iter
        Forwarded to ``LogisticRegression``.

    Returns
    -------
    list
        ``[acc, prec, rec, f1scor, X_train, X_test, y_train, y_test, y_hat]``
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import f1_score
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import roc_curve, auc
    from sklearn.metrics import average_precision_score
    from sklearn.metrics import recall_score
    from sklearn.metrics import confusion_matrix

    # Normalizing X
    X = StandardScaler().fit(X).transform(X)

    # Setting training and test data from the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=True)
    X_train = np.around(X_train, decimals=3)
    X_test = np.around(X_test, decimals=3)

    # We train the binary logistic regression model and fit
    clf = LogisticRegression(solver='lbfgs', max_iter=max_iter).fit(X_train, y_train)

    # Predicted values from X_test
    y_hat = clf.predict(X_test)

    # Metrics
    acc = accuracy_score(y_test, y_hat)
    # NOTE(review): average_precision_score is meant for continuous scores; it is fed
    # the hard predictions here. If plain precision was intended, precision_score is
    # the matching call - confirm before relying on `prec`.
    prec = average_precision_score(y_test, y_hat, average='macro')
    rec = recall_score(y_test, y_hat, average='macro')
    f1scor = f1_score(y_test, y_hat, average='binary')

    # Scores for the ROC curve. The classifier is already fitted on the training
    # data, so it is not trained a second time here.
    y_score = clf.decision_function(X_test)

    # Compute ROC curve, ROC area and plot it
    fpr, tpr, threshold = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    print('ROC AUC: ', roc_auc)

    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='red',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance-level diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC AUC for binary logistic regression ')
    plt.legend(loc="lower right")
    plt.show()

    print(confusion_matrix(y_test, y_hat))

    return [acc, prec, rec, f1scor, X_train, X_test, y_train, y_test, y_hat]

## Multiclass logistic regression

In [None]:
# We import some methods that will be used
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix

# Get the data from the df
data = df.to_numpy()

# X holds every column but the last one (the features), standardized column-wise
X = data[:, :-1]
X = StandardScaler().fit(X).transform(X)

# Dependent variable to predict is stored in y (last column)
y = data[:, -1]

# Partition X and y into training and test data --> 20% of the rows are held out
# for testing and they are picked randomly (shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

# We create the logistic regression model for this multiclass case and train it on
# the training partition only. Fitting on the full X, y would let the model see the
# test rows and make the metrics below over-optimistic.
clf = LogisticRegression(multi_class='multinomial', solver='lbfgs').fit(X_train, y_train)

# We use the model to calculate the y_hat values
y_hat = clf.predict(X_test)

# We calculate metrics: macro-averaged f1_score and accuracy.
# np.round is used instead of the .round() method so that rounding also works when
# the metric is returned as a plain Python float.
f1score = np.round(f1_score(y_test, y_hat, average='macro'), decimals=3)
accuracy = np.round(accuracy_score(y_test, y_hat), decimals=3)

# We display the confusion matrix to understand better where the model works better/worse.
# Remember that a) the diagonal holds the correctly classified cases and
# b) rows are the true labels while columns are the predicted labels.
cm = confusion_matrix(y_test, y_hat)
print(cm)  # a bare `cm` in the middle of a cell is not displayed

# We plot the confusion matrix, which lets us visualize how the model behaves.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test) is the replacement.
plot_confusion_matrix(clf, X_test, y_test)
plt.show()

# Note that by default the prediction is the label with the highest probability.
# The per-class probabilities computed by the model show in which cases it is more confident.
y_hat_prob = clf.predict_proba(X_test)
print(y_hat_prob)

## Decision trees

In [None]:
# Importing the packages and methods
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix

# f1_score is used below but is not part of this cell's own imports
from sklearn.metrics import f1_score

# Build the decision tree model and train it
depth = 4 # depth of the tree
DT = tree.DecisionTreeClassifier(criterion="entropy", max_depth=depth)
DT = DT.fit(X_train, y_train)

# Predicted values from X_test
y_hat = DT.predict(X_test)

# Getting metrics.
# np.round is used instead of the .round() method so that rounding also works when
# the metric is returned as a plain Python float.
print("DecisionTrees's f1_score: ", f1_score(y_test, y_hat, average='macro'))
DT_accuracy = np.round(accuracy_score(y_test, y_hat), decimals=3)
print('DT_accuracy_score = ', DT_accuracy)

# Getting confusion matrix (rows: true labels, columns: predicted labels)
cm = confusion_matrix(y_test, y_hat)
print(cm)

# Fraction of correctly classified samples for the first two classes:
# diagonal entry divided by the row total (i.e. per-class recall).
cmfor0 = cm[0, 0]/np.sum(cm[0, ]); print(cmfor0)
cmfor1 = cm[1, 1]/np.sum(cm[1, ]); print(cmfor1)

# Plotting the tree (very large canvas and font so the nodes remain readable)
plt.figure(figsize=(160, 50))
a = plot_tree(DT,
              class_names=['Setosa', 'Versicolor', 'Virginica'],
              feature_names=['sepal length', 'sepal width', 'petal.length', 'petal.width'],
              filled=True,
              rounded=True,
              fontsize=80)
plt.show()

## SVM for multiclass classification

In [None]:
# Loading packages and methods
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Training and fitting the model: RBF-kernel SVC with gamma=1, C=1 and a
# one-vs-one shaped decision function
SVM = SVC(kernel='rbf', gamma=1, C=1, decision_function_shape='ovo')
SVM = SVM.fit(X_train, y_train)

# Calculating predictions
y_hat = SVM.predict(X_test)

# Accuracy score for the model SVM.
# np.round is used instead of the .round() method so that rounding also works when
# accuracy_score returns a plain Python float.
SVM_accuracy = np.round(accuracy_score(y_test, y_hat), decimals=3)
print('SVM_accuracy_score = ', SVM_accuracy)