Mounting the Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive

Mounted at /content/drive
/content/drive


Importing the libraries


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt                                      #for plotting
from sklearn.linear_model import LogisticRegression                  #algorithm1
from sklearn.naive_bayes import GaussianNB                           #algorithm2
from sklearn.tree import DecisionTreeClassifier                      #algorithm3
from sklearn.metrics import roc_auc_score, roc_curve, auc,r2_score   #performance metrics
from sklearn.ensemble import AdaBoostClassifier                      #for boosting
from sklearn.ensemble import VotingClassifier                        #for maxvoting

Training and testing dataset

In [3]:
data = pd.read_csv('/content/drive/MyDrive/ML DS539/CoLab/training.csv', encoding='latin1')
test_data = pd.read_csv('/content/drive/MyDrive/ML DS539/CoLab/testing.csv', encoding='latin1')

# Classification Section

Dropping the redundant columns

In [4]:
col = ["Next_Year_Crime_Rate", "state"]

for i in col:
    data = data.drop(i, axis=1)
    test_data = test_data.drop(i, axis=1)


In [5]:
X = data.drop("Next_Year_Top_10", axis=1)
Y = data["Next_Year_Top_10"]

In [6]:
X_test = test_data.drop("Next_Year_Top_10", axis=1)
Y_test = test_data["Next_Year_Top_10"]

Scaling the values

In [7]:
X["year"] = [(1 + (i-1980)) for i in X["year"]]
X_test["year"] = [(1 + (i-1980)) for i in X_test["year"]]

#This allows for the scaling of the years to be the same (i.e. 1980 is the same distance away from 1981 as 1981 is from 1982) and the timeline/linearity to stay the same as well (i.e. 1980 < 1981)

Build our model using Decision Tree, SVM and Gaussian Naive Bayes

In [8]:
estimators = []

lr_model = LogisticRegression()
estimators.append(("Logistic Regression: ", lr_model))

dt_model = DecisionTreeClassifier()
estimators.append(("Decision Tree: ", dt_model))

gb_model = GaussianNB()
estimators.append(("Gaussian Naive Bayes: ", gb_model))

Building individual model with each classifier

In [9]:
#individual models accuracy score
from sklearn.metrics import accuracy_score, classification_report
#j = 1
#i = range(3)
#for n in i:
for each_estim in (lr_model, dt_model, gb_model):
    each_estim.fit(X, Y)
    Y_pred = each_estim.predict(X_test)
    print("\n")
    print(each_estim, accuracy_score(Y_test,Y_pred))
    print(classification_report(Y_test,Y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




LogisticRegression() 0.8470588235294118
              precision    recall  f1-score   support

         0.0       0.86      0.97      0.91       205
         1.0       0.72      0.36      0.48        50

    accuracy                           0.85       255
   macro avg       0.79      0.66      0.70       255
weighted avg       0.83      0.85      0.83       255



DecisionTreeClassifier() 0.8784313725490196
              precision    recall  f1-score   support

         0.0       0.89      0.97      0.93       205
         1.0       0.81      0.50      0.62        50

    accuracy                           0.88       255
   macro avg       0.85      0.74      0.77       255
weighted avg       0.87      0.88      0.87       255



GaussianNB() 0.792156862745098
              precision    recall  f1-score   support

         0.0       0.96      0.78      0.86       205
         1.0       0.48      0.86      0.62        50

    accuracy                           0.79       255
   macr

#Max-Voting

Ensemble the models using *VotingClassifier*

In [10]:
#ensemble our models and use VotingClassifier to score the accuracy of the ensemble model

ensemble_model_hard = VotingClassifier(estimators=estimators, voting='hard')             #hard: uses predicted class labels for majority rule voting
ensemble_model_soft = VotingClassifier(estimators=estimators, voting='soft')             #soft: predicts the class label based on the argmax of the sums of the predicted probabilities

Calculate accuracy of the model

In [11]:
ensemble_model_hard.fit(X, Y)
ensemble_model_soft.fit(X, Y)
ensemble_pred_hard = ensemble_model_hard.predict(X_test)
ensemble_pred_soft = ensemble_model_soft.predict(X_test)
ensemble_score_hard = accuracy_score(Y_test, ensemble_pred_hard)
ensemble_score_soft = accuracy_score(Y_test, ensemble_pred_soft)

print("\n")
print("Ensemble CLassifier using Hard Voting: ", ensemble_score_hard)
print("Ensemble CLassifier using Soft Voting: ", ensemble_score_soft)

print("\n")
print(classification_report(Y_test,ensemble_pred_hard))
print(classification_report(Y_test,ensemble_pred_soft))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression




Ensemble CLassifier using Hard Voting:  0.8784313725490196
Ensemble CLassifier using Soft Voting:  0.8862745098039215


              precision    recall  f1-score   support

         0.0       0.90      0.95      0.93       205
         1.0       0.74      0.58      0.65        50

    accuracy                           0.88       255
   macro avg       0.82      0.77      0.79       255
weighted avg       0.87      0.88      0.87       255

              precision    recall  f1-score   support

         0.0       0.91      0.96      0.93       205
         1.0       0.77      0.60      0.67        50

    accuracy                           0.89       255
   macro avg       0.84      0.78      0.80       255
weighted avg       0.88      0.89      0.88       255



# Boosting

In [12]:
gb_model = GaussianNB()     #weak learner
gb_model.fit(X,Y)

GaussianNB()

In [13]:
print("The accuracy is:", (gb_model.score(X_test, Y_test))*100, '%')

The accuracy is: 79.2156862745098 %


In [15]:
#Adaboost: combine multiple “weak classifiers” into a single “strong classifier”

Adaboost = AdaBoostClassifier(n_estimators=200, base_estimator=gb_model, learning_rate=0.5, random_state=0)
Adaboost.fit(X,Y)


print("The accuracy is: ", (Adaboost.score(X_test, Y_test))*100, '%')

The accuracy is:  80.0 %


# Regression Section

In [16]:
#Importing the Bagging library
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression

In [17]:
data = pd.read_csv('/content/drive/MyDrive/ML DS539/CoLab/training.csv', encoding='latin1')
test_data = pd.read_csv('/content/drive/MyDrive/ML DS539/CoLab/testing.csv', encoding='latin1')

In [18]:
col = ["Next_Year_Top_10", "state"]

for i in col:
    data = data.drop(i, axis=1)
    test_data = test_data.drop(i, axis=1)

In [19]:
X = data.drop("Next_Year_Crime_Rate", axis=1)
Y = data["Next_Year_Crime_Rate"]

In [20]:
X_test = test_data.drop("Next_Year_Crime_Rate", axis=1)
Y_test = test_data["Next_Year_Crime_Rate"]

In [21]:
X["year"] = [(1 + (i-1980)) for i in X["year"]]
X_test["year"] = [(1 + (i-1980)) for i in X_test["year"]]

#This allows for the scaling of the years to be the same (i.e. 1980 is the same distance away from 1981 as 1981 is from 1982) and the timeline/linearity to stay the same as well (i.e. 1980 < 1981)

# Bagging

In [22]:
#Bagging: ensemble meta-estimator that fits base regressors each on random subsets of the original dataset and then aggregate their individual predictions
regressors = LinearRegression()
bag_lr_model = BaggingRegressor(regressors, max_features=1.0, n_estimators=5, bootstrap=True, random_state=1)

bag_lr_model.fit(X,Y)
bag_lr_model.score(X_test, Y_test)
pred_val = bag_lr_model.predict(X_test)

Performance metrics for Bagging Regressor

In [23]:
from sklearn import metrics

print("The accuracy score is: ", r2_score(Y_test, pred_val))

print ("mean_absolute_error :",metrics.mean_absolute_error(Y_test, pred_val))
print("\n")
print ("mean_squared_error : ",metrics.mean_squared_error(Y_test, pred_val))
print("\n")
print ("root_mean_squared_error : ",np.sqrt(metrics.mean_squared_error(Y_test, pred_val)))

The accuracy score is:  0.48514645620429786
mean_absolute_error : 1.2375475532804312


mean_squared_error :  3.964826317999714


root_mean_squared_error :  1.9911871629758249


# Weighted Averaging

In [None]:
import tensorflow as tf
from sklearn.tree import DecisionTreeRegressor

In [None]:
ann = tf.keras.models.Sequential()
ann.add(tf.keras.layers.Dense(units=10, activation="relu"))
ann.add(tf.keras.layers.Dense(units=10, activation="relu"))
ann.add(tf.keras.layers.Dense(units=10, activation="relu"))
ann.add(tf.keras.layers.Dense(units=1))

In [None]:
ann.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
#For weighted averaging, we use:
#-Decision Tree
#-Linear Regression
#-Artificial Neural Networks

avg_lr_model = LinearRegression()
regtree_model = DecisionTreeRegressor()

avg_lr_model.fit(X,Y)
ann.fit(X,Y, epochs=100)
regtree_model.fit(X,Y)

In [None]:
linreg_preds = avg_lr_model.predict(X_test)
ann_preds = ann.predict(X_test)
regtree_preds = regtree_model.predict(X_test)

In [None]:
#weighted averaging: takes into account the varying degrees of importance of different algorithms in a data set

weighted_avg_preds=(linreg_preds * 0.3 + ann_preds * 0.4 + regtree_preds * 0.3)

In [None]:
weighted_avg_preds