In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
# Apply matplotlib's built-in "ggplot" style sheet to the figures drawn in this notebook.
style.use("ggplot")

from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold,cross_val_score, GridSearchCV, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge, Lasso


from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.naive_bayes import GaussianNB

Most Features

In [None]:
# Read the raw crime dataset and print its column summary.
# The path looks like a Google Colab location; for a local run use
# pd.read_csv('crimedata2.csv', encoding = "ISO-8859-1") instead.
crime = pd.read_csv(
    '/content/sample_data/crimedata2.csv',
    encoding="ISO-8859-1",
)
crime.info()

In [None]:
# Broad candidate feature set (together with the autoTheft target) for a first
# look at pairwise correlations.
broad_columns = [
    'autoTheft', 'numbUrban', 'pctUrban', 'medIncome', 'pctWWage', 'pctWFarmSelf',
    'pctWInvInc', 'pctWSocSec', 'pctWPubAsst', 'pctWRetire',
    'medFamInc', 'perCapInc', 'NumUnderPov', 'PctPopUnderPov', 'PctNotHSGrad',
    'PctBSorMore', 'PctUnemployed', 'PctEmploy', 'PctEmplManu',
    'PctEmplProfServ', 'PctOccupManu', 'PctOccupMgmtProf', 'PctLargHouseOccup',
    'PersPerOccupHous', 'PersPerOwnOccHous', 'PersPerRentOccHous',
    'PctPersOwnOccup', 'PctPersDenseHous', 'MedNumBR', 'HousVacant', 'PctHousOccup',
    'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos',
    'MedRent', 'MedOwnCostPctInc', 'MedOwnCostPctIncNoMtg', 'PctBornSameState',
    'PctSameHouse85', 'PctSameCity85', 'PctSameState85',
]

# Clean-up pipeline: '?' placeholders -> NaN, every column coerced to a numeric
# dtype, then only complete rows are kept.
crime_pb = (
    crime[broad_columns]
    .replace('?', float('nan'))
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

# Pairwise correlations, rendered as an annotated heatmap.
corr = crime_pb.corr()

plt.figure(figsize=(20, 12))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='viridis')
plt.title('Correlation Heatmap', fontsize=20)
plt.show()

Selected Features

In [None]:
# Hand-picked columns for the auto-theft analysis: the candidate predictors plus
# the autoTheft target itself.
selected_columns = [
    'pctUrban', 'medIncome', 'PctPopUnderPov', 'PctLess9thGrade', 'PctNotHSGrad',
    'PctBSorMore', 'PctEmploy', 'PctUnemployed', 'PctVacMore6Mos', 'numbUrban',
    'PctVacantBoarded', 'agePct12t29', 'agePct16t24', 'agePct65up', 'RentMedian',
    'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp', 'PctUsePubTrans',
    'autoTheft', 'HousVacant',
]
autoCrime_pb = crime[selected_columns]

In [None]:
# Coerce every selected column to a numeric dtype (entries that cannot be parsed
# become NaN) and keep only the rows without missing values.
autoCrime_pb = (
    autoCrime_pb
    .apply(pd.to_numeric, errors='coerce')
    .replace('?', float('nan'))  # runs after the coercion, exactly as before
    .dropna()
)
# Display the cleaned frame as the cell output.
autoCrime_pb

In [None]:
# Make sure the selected frame is fully numeric and complete before correlating.
# The '?' -> NaN replacement is assigned back to autoCrime_pb itself, so no
# look-alike variable with a different capitalisation is left behind.
autoCrime_pb = (
    autoCrime_pb
    .replace('?', float('nan'))
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

# Pairwise correlations of the selected features, shown as an annotated heatmap.
corr = autoCrime_pb.corr()

plt.figure(figsize=(20, 12))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='viridis')
plt.title('Correlation Heatmap', fontsize=20)
plt.show()

In [None]:
# Reduced feature set (13 predictors plus the autoTheft target).
autoCrime_pb1 = crime[['pctUrban', 'medIncome', 'PctPopUnderPov', 'PctLess9thGrade', 'PctNotHSGrad',
                       'PctBSorMore', 'PctEmploy', 'PctUnemployed', 'HousVacant', 'PctVacantBoarded',
                       'PctVacMore6Mos', 'RentMedian', 'PctUsePubTrans', 'autoTheft']]

# '?' placeholders -> NaN, coerce every column to numeric, keep complete rows only.
# Every step is bound to autoCrime_pb1, so no misspelled twin variable is created
# along the way.
autoCrime_pb1 = (
    autoCrime_pb1
    .replace('?', float('nan'))
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

In [None]:
# Correlation matrix of the reduced feature set, drawn as an annotated heatmap.
corr = autoCrime_pb1.corr()

fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='viridis', ax=ax)
ax.set_title('Correlation Heatmap', fontsize=20)
plt.show()

In [None]:
# Summary statistics for every column of the cleaned autoCrime_pb frame.
autoCrime_pb.describe()

In [None]:
# Summary statistics for every column of the reduced autoCrime_pb1 frame.
autoCrime_pb1.describe()

In [None]:
# Box plots of medIncome, HousVacant and autoTheft on one shared axis.
B1 = autoCrime_pb1.loc[:, ["medIncome", "HousVacant", "autoTheft"]]
B1.boxplot()

In [None]:
# Box plots of the percentage-valued columns on one shared axis.
B2 = autoCrime_pb1.loc[:, [
    'pctUrban', 'PctPopUnderPov', 'PctNotHSGrad', 'PctBSorMore',
    'PctEmploy', 'PctUnemployed', 'PctVacMore6Mos', 'PctUsePubTrans',
]]
B2.boxplot()

In [None]:
# RentMedian gets its own box plot (kept as a one-column DataFrame).
B3 = autoCrime_pb1.loc[:, ['RentMedian']]
B3.boxplot()

In [None]:
# Standardise every column of autoCrime_pb1 (StandardScaler: zero mean, unit
# variance per column) so the columns can be compared on one box-plot axis.
# The fitted scaler is kept in `scaler`; the result gets the original column
# names back because fit_transform returns a plain array.
scaler = StandardScaler()
autoCrime_pb1_standard = pd.DataFrame(
    scaler.fit_transform(autoCrime_pb1),
    columns=autoCrime_pb1.columns,
)

In [None]:
# Box plots of all standardised columns; the x tick labels are slanted so the
# long column names stay readable.
ax = autoCrime_pb1_standard.boxplot()
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=10)
plt.show()

In [None]:
# Predictor columns for the regression models (column order is preserved).
feature_columns = [
    'pctUrban', 'medIncome', 'PctPopUnderPov', 'PctVacantBoarded', 'numbUrban',
    'PctNotHSGrad', 'PctBSorMore', 'PctEmploy', 'PctUnemployed', 'HousVacant',
    'PctVacMore6Mos', 'agePct12t29', 'agePct16t24', 'agePct65up', 'RentMedian',
    'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp', 'PctUsePubTrans',
]

# X: feature matrix, Y: target (autoTheft), both taken from the cleaned frame.
X = autoCrime_pb.loc[:, feature_columns]
Y = autoCrime_pb.loc[:, 'autoTheft']

In [None]:
# Hold out 20% of the rows as a test set. The seed value itself is arbitrary;
# it only has to be the same everywhere in the notebook.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=21,
)

In [None]:
# Estimators to compare, keyed by the label used when printing results.
# NOTE(review): GaussianNB is a classifier while the other entries are
# regressors — confirm it is meant to be part of a regression comparison.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor(random_state=21)),
    ('Random Forest', RandomForestRegressor(random_state=21)),
    ('Naive Bayes', GaussianNB()),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=21)),
])


In [None]:
# 5-fold splitter: rows are shuffled once with a fixed seed, so every model is
# evaluated on the same folds.
# Scoring reference: https://scikit-learn.org/stable/modules/model_evaluation.html
kf = KFold(5, shuffle=True, random_state=21)

In [None]:
from sklearn.model_selection import cross_validate

# Cross-validated comparison of every estimator in `models` on the folds of `kf`.
# All three metrics come from ONE cross_validate run per model, so each
# estimator is fitted once per fold instead of three times; with the fixed
# seeds the per-fold numbers are the same as three separate runs would give.
scoring = ('neg_mean_squared_error', 'neg_root_mean_squared_error', 'r2')

results = {}

for name, model in models.items():
    cv_out = cross_validate(model, X, Y, cv=kf, scoring=scoring)

    # scikit-learn reports error metrics negated (higher is better); flip the
    # sign back so MSE / RMSE are positive.
    mse_scores = -cv_out['test_neg_mean_squared_error']
    rmse_scores = -cv_out['test_neg_root_mean_squared_error']
    r2_scores = cv_out['test_r2']

    # Keep the raw per-fold arrays next to their mean and standard deviation.
    results[name] = {
        'MSE': mse_scores,
        'MSE Mean': np.mean(mse_scores),
        'MSE Std': np.std(mse_scores),
        'RMSE': rmse_scores,
        'RMSE Mean': np.mean(rmse_scores),
        'RMSE Std': np.std(rmse_scores),
        'R2': r2_scores,
        'R2 Mean': np.mean(r2_scores),
        'R2 Std': np.std(r2_scores)
    }

# Print Results
for name, metrics in results.items():
    print(f"\n{name}")
    print(f"Mean MSE: {metrics['MSE Mean']:.2f}, Std MSE: {metrics['MSE Std']:.2f}")
    print(f"Mean RMSE: {metrics['RMSE Mean']:.2f}, Std RMSE: {metrics['RMSE Std']:.2f}")
    print(f"Mean R2: {metrics['R2 Mean']:.2f}, Std R2: {metrics['R2 Std']:.2f}")

In [None]:
from sklearn.model_selection import cross_validate

#k = 10
# Same comparison as the 5-fold run, on 10 shuffled folds with the same seed.
kf = KFold(n_splits=10, shuffle=True, random_state=21)    # 10 folds

# All three metrics come from ONE cross_validate run per model, so each
# estimator is fitted once per fold instead of three times; with the fixed
# seeds the per-fold numbers are the same as three separate runs would give.
scoring = ('neg_mean_squared_error', 'neg_root_mean_squared_error', 'r2')

results1 = {}

for name, model in models.items():
    cv_out = cross_validate(model, X, Y, cv=kf, scoring=scoring)

    # scikit-learn reports error metrics negated (higher is better); flip the
    # sign back so MSE / RMSE are positive.
    mse_scores = -cv_out['test_neg_mean_squared_error']
    rmse_scores = -cv_out['test_neg_root_mean_squared_error']
    r2_scores = cv_out['test_r2']

    # Keep the raw per-fold arrays next to their mean and standard deviation.
    results1[name] = {
        'MSE': mse_scores,
        'MSE Mean': np.mean(mse_scores),
        'MSE Std': np.std(mse_scores),
        'RMSE': rmse_scores,
        'RMSE Mean': np.mean(rmse_scores),
        'RMSE Std': np.std(rmse_scores),
        'R2': r2_scores,
        'R2 Mean': np.mean(r2_scores),
        'R2 Std': np.std(r2_scores)
    }

# Print Results
for name, metrics in results1.items():
    print(f"\n{name}")
    print(f"Mean MSE: {metrics['MSE Mean']:.2f}, Std MSE: {metrics['MSE Std']:.2f}")
    print(f"Mean RMSE: {metrics['RMSE Mean']:.2f}, Std RMSE: {metrics['RMSE Std']:.2f}")
    print(f"Mean R2: {metrics['R2 Mean']:.2f}, Std R2: {metrics['R2 Std']:.2f}")

In [None]:
# LINEAR REGRESSION — manual 5-fold cross-validation

# k and crossvalidation are also read by the cells that follow.
k = 5
crossvalidation = KFold(n_splits=k, shuffle=True, random_state=21)

reg = LinearRegression()

# One entry per fold for each metric
r2_scores, mse_scores, rmse_scores = [], [], []

for train_index, test_index in crossvalidation.split(X):
    # Rows used for fitting vs. rows held out in this fold
    X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
    X_test, Y_test = X.iloc[test_index], Y.iloc[test_index]

    # fit() returns the estimator, so the prediction is chained onto it
    y_pred = reg.fit(X_train, Y_train).predict(X_test)

    # Score the held-out predictions
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(Y_test, y_pred)

    r2_scores.append(r2)
    mse_scores.append(mse)
    rmse_scores.append(rmse)

    print(f"Fold results: R-squared = {r2:.4f}, MSE = {mse:.4f}, RMSE = {rmse:.4f}")

# Mean of every metric over the folds
print(f"\nAverage results: R-squared = {np.mean(r2_scores):.4f}, MSE = {np.mean(mse_scores):.4f}, RMSE = {np.mean(rmse_scores):.4f}")


In [None]:
# DECISION TREE — manual k-fold cross-validation (k comes from an earlier cell)

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd

dt = DecisionTreeRegressor(random_state=21)

# Same shuffled folds as the other models (fixed seed)
crossvalidation = KFold(n_splits=k, shuffle=True, random_state=21)

# One entry per fold for each metric
r2_scores, mse_scores, rmse_scores = [], [], []

for train_index, test_index in crossvalidation.split(X):
    # Rows used for fitting vs. rows held out in this fold
    X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
    X_test, Y_test = X.iloc[test_index], Y.iloc[test_index]

    # fit() returns the estimator, so the prediction is chained onto it
    y_pred = dt.fit(X_train, Y_train).predict(X_test)

    # Score the held-out predictions
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(Y_test, y_pred)

    r2_scores.append(r2)
    mse_scores.append(mse)
    rmse_scores.append(rmse)

    print(f"Fold results: R-squared = {r2:.4f}, MSE = {mse:.4f}, RMSE = {rmse:.4f}")

# Mean of every metric over the folds
print(f"\nAverage results: R-squared = {np.mean(r2_scores):.4f}, MSE = {np.mean(mse_scores):.4f}, RMSE = {np.mean(rmse_scores):.4f}")

In [None]:
# RANDOM FOREST — manual k-fold cross-validation (k comes from an earlier cell)

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

rf = RandomForestRegressor(random_state=21)

# Same shuffled folds as the other models (fixed seed)
crossvalidation = KFold(n_splits=k, shuffle=True, random_state=21)

# One entry per fold for each metric
r2_scores, mse_scores, rmse_scores = [], [], []

for train_index, test_index in crossvalidation.split(X):
    # Rows used for fitting vs. rows held out in this fold
    X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
    X_test, Y_test = X.iloc[test_index], Y.iloc[test_index]

    # fit() returns the estimator, so the prediction is chained onto it
    y_pred = rf.fit(X_train, Y_train).predict(X_test)

    # Score the held-out predictions
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(Y_test, y_pred)

    r2_scores.append(r2)
    mse_scores.append(mse)
    rmse_scores.append(rmse)

    print(f"Fold results: R-squared = {r2:.4f}, MSE = {mse:.4f}, RMSE = {rmse:.4f}")

# Mean of every metric over the folds
print(f"\nAverage results: R-squared = {np.mean(r2_scores):.4f}, MSE = {np.mean(mse_scores):.4f}, RMSE = {np.mean(rmse_scores):.4f}")

In [None]:
#GRADIENT BOOSTING — manual k-fold cross-validation (k comes from an earlier cell)

# Import the estimator that is actually used: this is a regression task.
from sklearn.ensemble import GradientBoostingRegressor

gbc = GradientBoostingRegressor(random_state=21)

# Lists to store metrics for each fold
r2_scores = []
mse_scores = []
rmse_scores = []

# Build the splitter here instead of relying on whatever an earlier cell left
# in `crossvalidation`; the fixed seed reproduces the folds used for the other
# models.
crossvalidation = KFold(n_splits=k, shuffle=True, random_state=21)

# Perform k-fold cross-validation
for train_index, test_index in crossvalidation.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Fit the model on this fold's training rows
    gbc.fit(X_train, Y_train)

    # Predict the held-out rows
    y_pred = gbc.predict(X_test)

    # Calculate metrics
    r2 = r2_score(Y_test, y_pred)
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)

    # Append metrics to lists
    r2_scores.append(r2)
    mse_scores.append(mse)
    rmse_scores.append(rmse)

    # Print metrics for the current fold
    print(f"Fold results: R-squared = {r2:.4f}, MSE = {mse:.4f}, RMSE = {rmse:.4f}")

# Print average metrics across all folds
print(f"\nAverage results: R-squared = {np.mean(r2_scores):.4f}, MSE = {np.mean(mse_scores):.4f}, RMSE = {np.mean(rmse_scores):.4f}")

In [None]:
#NAIVE BAYES — manual k-fold cross-validation (k comes from an earlier cell)
# NOTE(review): GaussianNB is a classifier; here its predictions are scored
# with regression metrics — confirm this is intended.

from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()

# Same shuffled folds as the other models (fixed seed)
crossvalidation = KFold(n_splits=k, shuffle=True, random_state=21)

# One entry per fold for each metric
r2_scores, mse_scores, rmse_scores = [], [], []

for train_index, test_index in crossvalidation.split(X):
    # Rows used for fitting vs. rows held out in this fold
    X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
    X_test, Y_test = X.iloc[test_index], Y.iloc[test_index]

    # fit() returns the estimator, so the prediction is chained onto it
    y_pred = nb.fit(X_train, Y_train).predict(X_test)

    # Score the held-out predictions
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(Y_test, y_pred)

    r2_scores.append(r2)
    mse_scores.append(mse)
    rmse_scores.append(rmse)

    print(f"Fold results: R-squared = {r2:.4f}, MSE = {mse:.4f}, RMSE = {rmse:.4f}")

# Mean of every metric over the folds
print(f"\nAverage results: R-squared = {np.mean(r2_scores):.4f}, MSE = {np.mean(mse_scores):.4f}, RMSE = {np.mean(rmse_scores):.4f}")



In [None]:
# Switch the manual cross-validation cells to 10 folds.
# Both names are read by the cells that follow.
k = 10
crossvalidation = KFold(n_splits=k, shuffle=True, random_state=21)

In [None]:
# LINEAR REGRESSION — manual cross-validation on the splitter defined in the
# previous cell

reg = LinearRegression()

# One entry per fold for each metric
r2_scores, mse_scores, rmse_scores = [], [], []

for train_index, test_index in crossvalidation.split(X):
    # Rows used for fitting vs. rows held out in this fold
    X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
    X_test, Y_test = X.iloc[test_index], Y.iloc[test_index]

    # fit() returns the estimator, so the prediction is chained onto it
    y_pred = reg.fit(X_train, Y_train).predict(X_test)

    # Score the held-out predictions
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(Y_test, y_pred)

    r2_scores.append(r2)
    mse_scores.append(mse)
    rmse_scores.append(rmse)

    print(f"Fold results: R-squared = {r2:.4f}, MSE = {mse:.4f}, RMSE = {rmse:.4f}")

# Mean of every metric over the folds
print(f"\nAverage results: R-squared = {np.mean(r2_scores):.4f}, MSE = {np.mean(mse_scores):.4f}, RMSE = {np.mean(rmse_scores):.4f}")

In [None]:
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import shapiro

# Residual diagnostics for the most recent predictions. Y_test and y_pred are
# whatever the last executed fold left behind, so run this cell directly after
# the model whose residuals you want to inspect.
residuals = Y_test - y_pred

# Residuals vs Fitted Values Plot
# Residuals should scatter randomly around zero with no visible pattern;
# a clear pattern indicates that the model may not be appropriate.
plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs Fitted Values')
plt.show()

# Histogram of Residuals
# The histogram should resemble a normal distribution.
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True)
plt.title('Histogram of Residuals')
plt.show()

# Q-Q Plot
# Points lying close to the reference line indicate normality. The residuals
# are on the scale of the target rather than standardised, so the reference
# line must be the standardised line ('s', scaled by the sample's standard
# deviation and shifted by its mean); a plain 45-degree line only makes sense
# for data that is already ~N(0, 1).
sm.qqplot(residuals, line='s')
plt.title('Q-Q Plot')
plt.show()

# Shapiro-Wilk Test for Normality
# A p-value above the significance level (commonly 0.05) means the test does
# not reject normality of the residuals; a smaller p-value is evidence against
# it.
shapiro_test = shapiro(residuals)
print('Shapiro-Wilk Test p-value:', shapiro_test.pvalue)

In [None]:
# DECISION TREE — manual k-fold cross-validation (k comes from an earlier cell)

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd

dt = DecisionTreeRegressor(random_state=21)

# Same shuffled folds as the other models (fixed seed)
crossvalidation = KFold(n_splits=k, shuffle=True, random_state=21)

# One entry per fold for each metric
r2_scores, mse_scores, rmse_scores = [], [], []

for train_index, test_index in crossvalidation.split(X):
    # Rows used for fitting vs. rows held out in this fold
    X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
    X_test, Y_test = X.iloc[test_index], Y.iloc[test_index]

    # fit() returns the estimator, so the prediction is chained onto it
    y_pred = dt.fit(X_train, Y_train).predict(X_test)

    # Score the held-out predictions
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(Y_test, y_pred)

    r2_scores.append(r2)
    mse_scores.append(mse)
    rmse_scores.append(rmse)

    print(f"Fold results: R-squared = {r2:.4f}, MSE = {mse:.4f}, RMSE = {rmse:.4f}")

# Mean of every metric over the folds
print(f"\nAverage results: R-squared = {np.mean(r2_scores):.4f}, MSE = {np.mean(mse_scores):.4f}, RMSE = {np.mean(rmse_scores):.4f}")

In [None]:
# RANDOM FOREST — manual k-fold cross-validation (k comes from an earlier cell)

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

rf = RandomForestRegressor(random_state=21)

# Same shuffled folds as the other models (fixed seed)
crossvalidation = KFold(n_splits=k, shuffle=True, random_state=21)

# One entry per fold for each metric
r2_scores, mse_scores, rmse_scores = [], [], []

for train_index, test_index in crossvalidation.split(X):
    # Rows used for fitting vs. rows held out in this fold
    X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
    X_test, Y_test = X.iloc[test_index], Y.iloc[test_index]

    # fit() returns the estimator, so the prediction is chained onto it
    y_pred = rf.fit(X_train, Y_train).predict(X_test)

    # Score the held-out predictions
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(Y_test, y_pred)

    r2_scores.append(r2)
    mse_scores.append(mse)
    rmse_scores.append(rmse)

    print(f"Fold results: R-squared = {r2:.4f}, MSE = {mse:.4f}, RMSE = {rmse:.4f}")

# Mean of every metric over the folds
print(f"\nAverage results: R-squared = {np.mean(r2_scores):.4f}, MSE = {np.mean(mse_scores):.4f}, RMSE = {np.mean(rmse_scores):.4f}")

In [None]:
#GRADIENT BOOSTING — manual k-fold cross-validation (k comes from an earlier cell)

# Import the estimator that is actually used: this is a regression task.
from sklearn.ensemble import GradientBoostingRegressor

gbc = GradientBoostingRegressor(random_state=21)

# Lists to store metrics for each fold
r2_scores = []
mse_scores = []
rmse_scores = []

# Build the splitter here instead of relying on whatever an earlier cell left
# in `crossvalidation`; the fixed seed reproduces the folds used for the other
# models.
crossvalidation = KFold(n_splits=k, shuffle=True, random_state=21)

# Perform k-fold cross-validation
for train_index, test_index in crossvalidation.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Fit the model on this fold's training rows
    gbc.fit(X_train, Y_train)

    # Predict the held-out rows
    y_pred = gbc.predict(X_test)

    # Calculate metrics
    r2 = r2_score(Y_test, y_pred)
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)

    # Append metrics to lists
    r2_scores.append(r2)
    mse_scores.append(mse)
    rmse_scores.append(rmse)

    # Print metrics for the current fold
    print(f"Fold results: R-squared = {r2:.4f}, MSE = {mse:.4f}, RMSE = {rmse:.4f}")

# Print average metrics across all folds
print(f"\nAverage results: R-squared = {np.mean(r2_scores):.4f}, MSE = {np.mean(mse_scores):.4f}, RMSE = {np.mean(rmse_scores):.4f}")


In [None]:
#NAIVE BAYES — manual k-fold cross-validation (k comes from an earlier cell)
# NOTE(review): GaussianNB is a classifier; here its predictions are scored
# with regression metrics — confirm this is intended.

from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()

# Lists to store metrics for each fold
r2_scores = []
mse_scores = []
rmse_scores = []

# Build the splitter here instead of relying on whatever an earlier cell left
# in `crossvalidation`; the fixed seed reproduces the folds used for the other
# models.
crossvalidation = KFold(n_splits=k, shuffle=True, random_state=21)

# Perform k-fold cross-validation
for train_index, test_index in crossvalidation.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Fit the model on this fold's training rows
    nb.fit(X_train, Y_train)

    # Predict the held-out rows
    y_pred = nb.predict(X_test)

    # Calculate metrics
    r2 = r2_score(Y_test, y_pred)
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)

    # Append metrics to lists
    r2_scores.append(r2)
    mse_scores.append(mse)
    rmse_scores.append(rmse)

    # Print metrics for the current fold
    print(f"Fold results: R-squared = {r2:.4f}, MSE = {mse:.4f}, RMSE = {rmse:.4f}")

# Print average metrics across all folds
print(f"\nAverage results: R-squared = {np.mean(r2_scores):.4f}, MSE = {np.mean(mse_scores):.4f}, RMSE = {np.mean(rmse_scores):.4f}")

In [None]:
#GRID SEARCH
#RIDGE REGRESSION

# Re-create the 80/20 hold-out split before tuning. The manual k-fold loops
# earlier in the notebook rebind X_train / X_test / Y_train / Y_test on every
# fold, so without this step the search would silently run on the last
# cross-validation fold instead of the hold-out split.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=21)

# Define parameter grid for Ridge (regularisation strength)
grid1 = {'alpha': [0.1, 0.5, 1,10,25,75, 85,100]}

# Ridge Regression with GridSearchCV (5-fold search on the training rows)
ridge = Ridge(random_state=21)
ridge_cv = GridSearchCV(ridge, grid1, cv=5)
ridge_cv.fit(X_train, Y_train)
ridge_best = ridge_cv.best_estimator_
print(f'Best Parameters: {ridge_best}')

# Make predictions on the hold-out rows and evaluate performance
y_pred = ridge_best.predict(X_test)
r2 = r2_score(Y_test, y_pred)
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
print(f'R-squared: {r2:.4f}')
print(f'Mean Squared Error: {mse:.4f}')
print(f'Root Mean Squared Error: {rmse:.4f}')

In [None]:
# Ridge regression: 10-fold grid search over the regularisation strength alpha
grid1 = {
    'alpha': [0.1, 0.5, 1, 10, 25, 75, 85, 100],
}

ridge = Ridge(random_state=21)
# fit() returns the search object itself, so construction and fitting are chained
ridge_cv = GridSearchCV(ridge, grid1, cv=10).fit(X_train, Y_train)
ridge_best = ridge_cv.best_estimator_
print(f'Best Parameters: {ridge_best}')

# Score the best estimator on X_test / Y_test
y_pred = ridge_best.predict(X_test)
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, y_pred)
print(f'R-squared: {r2:.4f}')
print(f'Mean Squared Error: {mse:.4f}')
print(f'Root Mean Squared Error: {rmse:.4f}')

In [None]:
# LASSO REGRESSION

# 5-fold grid search over the regularisation strength alpha
grid2 = {
    'alpha': [0.1, 0.5, 1, 10, 25, 75, 85, 100],
}

lasso = Lasso(random_state=21)
# fit() returns the search object itself, so construction and fitting are chained
lasso_cv = GridSearchCV(lasso, grid2, cv=5).fit(X_train, Y_train)
lasso_best = lasso_cv.best_estimator_
print(f'Best Parameters: {lasso_best}')

# Score the best estimator on X_test / Y_test
y_pred = lasso_best.predict(X_test)
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, y_pred)
print(f'R-squared: {r2:.4f}')
print(f'Mean Squared Error: {mse:.4f}')
print(f'Root Mean Squared Error: {rmse:.4f}')


In [None]:
# Lasso regression: 10-fold grid search over the regularisation strength alpha
grid2 = {
    'alpha': [0.1, 0.5, 1, 10, 25, 75, 85, 100],
}

lasso = Lasso(random_state=21)
# fit() returns the search object itself, so construction and fitting are chained
lasso_cv = GridSearchCV(lasso, grid2, cv=10).fit(X_train, Y_train)
lasso_best = lasso_cv.best_estimator_
print(f'Best Parameters: {lasso_best}')

# Score the best estimator on X_test / Y_test
y_pred = lasso_best.predict(X_test)
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, y_pred)
print(f'R-squared: {r2:.4f}')
print(f'Mean Squared Error: {mse:.4f}')
print(f'Root Mean Squared Error: {rmse:.4f}')

In [None]:
#DECISION TREE

# Search space: tree depth and the minimum sample counts for splits and leaves
grid3 = dict(
    max_depth=[2, 4, 6, 8, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

decision_tree = DecisionTreeRegressor(random_state=21)

# 5-fold grid search that picks the combination with the lowest mean squared error
grid_search = GridSearchCV(
    decision_tree,
    grid3,
    cv=5,
    scoring='neg_mean_squared_error',
).fit(X_train, Y_train)

# Winning parameter combination and the estimator fitted with it
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print(f'Best Parameters: {best_params}')

# Score the best estimator on X_test / Y_test
y_pred = best_model.predict(X_test)
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, y_pred)
print(f'R-squared: {r2:.4f}')
print(f'Mean Squared Error: {mse:.4f}')
print(f'Root Mean Squared Error: {rmse:.4f}')

In [None]:
# Decision tree: same search space as the 5-fold run, evaluated with 10 folds
grid3 = dict(
    max_depth=[2, 4, 6, 8, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

decision_tree = DecisionTreeRegressor(random_state=21)

# 10-fold grid search that picks the combination with the lowest mean squared error
grid_search = GridSearchCV(
    decision_tree,
    grid3,
    cv=10,
    scoring='neg_mean_squared_error',
).fit(X_train, Y_train)

# Winning parameter combination and the estimator fitted with it
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print(f'Best Parameters: {best_params}')

# Score the best estimator on X_test / Y_test
y_pred = best_model.predict(X_test)
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, y_pred)
print(f'R-squared: {r2:.4f}')
print(f'Mean Squared Error: {mse:.4f}')
print(f'Root Mean Squared Error: {rmse:.4f}')

In [None]:
# RANDOM FOREST

# Search space: number of trees and tree depth, with sqrt feature sub-sampling
grid4 = dict(
    n_estimators=[5, 10, 15],
    max_depth=[2, 3],
    max_features=['sqrt'],
)

random_forest = RandomForestRegressor(random_state=21)
# 5-fold grid search; fit() returns the search object, so the call is chained
random_forest_cv = GridSearchCV(random_forest, grid4, cv=5).fit(X_train, Y_train)
random_forest_best = random_forest_cv.best_estimator_
print(f'Best Parameters: {random_forest_best}')

# Score the best estimator on X_test / Y_test
y_pred = random_forest_best.predict(X_test)
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, y_pred)
print(f'R-squared: {r2:.4f}')
print(f'Mean Squared Error: {mse:.4f}')
print(f'Root Mean Squared Error: {rmse:.4f}')

In [None]:
# Random forest: same search space as the 5-fold run, evaluated with 10 folds
grid4 = dict(
    n_estimators=[5, 10, 15],
    max_depth=[2, 3],
    max_features=['sqrt'],
)

random_forest = RandomForestRegressor(random_state=21)
# 10-fold grid search; fit() returns the search object, so the call is chained
random_forest_cv = GridSearchCV(random_forest, grid4, cv=10).fit(X_train, Y_train)
random_forest_best = random_forest_cv.best_estimator_
print(f'Best Parameters: {random_forest_best}')

# Score the best estimator on X_test / Y_test
y_pred = random_forest_best.predict(X_test)
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, y_pred)
print(f'R-squared: {r2:.4f}')
print(f'Mean Squared Error: {mse:.4f}')
print(f'Root Mean Squared Error: {rmse:.4f}')

In [None]:
# GRADIENT BOOSTING

# Search space: shrinkage, number of boosting stages and tree depth
grid5 = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 300],
    max_depth=[3, 4, 5],
)

gradient_boosting = GradientBoostingRegressor(random_state=21)

# 5-fold grid search; fit() returns the search object, so the call is chained
gradient_boosting_cv = GridSearchCV(gradient_boosting, grid5, cv=5).fit(X_train, Y_train)
gradient_boosting_best = gradient_boosting_cv.best_estimator_
print(f'Best Parameters: {gradient_boosting_best}')

# Score the best estimator on X_test / Y_test
y_pred = gradient_boosting_best.predict(X_test)
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, y_pred)
print(f'R-squared: {r2:.4f}')
print(f'Mean Squared Error: {mse:.4f}')
print(f'Root Mean Squared Error: {rmse:.4f}')

In [None]:
# Define parameter grid for Gradient Boosting
# (shrinkage, number of boosting stages, tree depth)
grid6 = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5]
}

# Gradient Boosting Regressor with GridSearchCV (10-fold search)
gradient_boosting = GradientBoostingRegressor(random_state=21)
gradient_boosting_cv = GridSearchCV(gradient_boosting, grid6, cv=10)
gradient_boosting_cv.fit(X_train, Y_train)
gradient_boosting_best = gradient_boosting_cv.best_estimator_
print(f'Best Parameters: {gradient_boosting_best}')

# Make predictions and evaluate performance.
# The predictions must come from the tuned gradient-boosting estimator found
# above; `best_model` belongs to the decision-tree search and would report the
# wrong model's scores under this heading.
y_pred = gradient_boosting_best.predict(X_test)
r2 = r2_score(Y_test, y_pred)
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
print(f'R-squared: {r2:.4f}')
print(f'Mean Squared Error: {mse:.4f}')
print(f'Root Mean Squared Error: {rmse:.4f}')

In [None]:
# NAIVE BAYES

# Naive Bayes with GridSearchCV, tuning var_smoothing parameter
# (50 log-spaced candidates between 1 and 1e-9, 5-fold search).
# NOTE(review): GaussianNB is a classifier, so the search ranks candidates with
# the classifier's default score while the report below uses regression
# metrics — confirm this is intended.
grid = {'var_smoothing': np.logspace(0, -9, num=50)}
naive_bayes = GaussianNB()
naive_bayes_cv = GridSearchCV(naive_bayes, grid, cv=5)
naive_bayes_cv.fit(X_train, Y_train)
naive_bayes_best = naive_bayes_cv.best_estimator_
print(f'Best Parameters: {naive_bayes_best}')

# Make predictions and evaluate performance
y_pred = naive_bayes_best.predict(X_test)
r2 = r2_score(Y_test, y_pred)
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
print(f'R-squared: {r2:.4f}')
print(f'Mean Squared Error: {mse:.4f}')
# RMSE is computed above and reported by every other grid-search cell; print it
# here too so the model summaries are comparable.
print(f'Root Mean Squared Error: {rmse:.4f}')

In [None]:
# Naive Bayes: 10-fold grid search over var_smoothing
# (50 log-spaced candidates between 1 and 1e-9).
# NOTE(review): GaussianNB is a classifier scored here with regression
# metrics — confirm this is intended.
grid = {
    'var_smoothing': np.logspace(0, -9, num=50),
}

naive_bayes = GaussianNB()
# fit() returns the search object itself, so construction and fitting are chained
naive_bayes_cv = GridSearchCV(naive_bayes, grid, cv=10).fit(X_train, Y_train)
naive_bayes_best = naive_bayes_cv.best_estimator_
print(f'Best Parameters: {naive_bayes_best}')

# Score the best estimator on X_test / Y_test
y_pred = naive_bayes_best.predict(X_test)
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, y_pred)
print(f'R-squared: {r2:.4f}')
print(f'Mean Squared Error: {mse:.4f}')
print(f'Root Mean Squared Error: {rmse:.4f}')

In [None]:
#CLUSTERING

from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm

# Columns used for clustering. NOTE: this rebinds autoCrime_pb, and the new
# selection no longer contains 'numbUrban', which the regression feature matrix
# was built from — re-run the earlier selection cell before going back to the
# regression cells.
autoCrime_pb = crime[['pctUrban', 'medIncome', 'PctPopUnderPov', 'PctLess9thGrade', 'PctNotHSGrad',
                      'PctBSorMore', 'PctEmploy', 'PctUnemployed', 'PctVacMore6Mos',
                      'PctVacantBoarded', 'agePct12t29', 'agePct16t24', 'agePct65up', 'RentMedian',
                      'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp', 'PctUsePubTrans',
                      'autoTheft', 'HousVacant']]

# '?' placeholders -> NaN, coerce every column to numeric, keep complete rows only.
# Every step is bound to autoCrime_pb, so no misspelled twin variable is created
# along the way.
autoCrime_pb = (
    autoCrime_pb
    .replace('?', float('nan'))
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

# Standardise all columns (StandardScaler: zero mean, unit variance) so that no
# single large-scale column dominates the distance computations in K-Means.
AutoCrime_standard = pd.DataFrame(StandardScaler().fit_transform(autoCrime_pb),columns = autoCrime_pb.columns)

In [None]:
# Define a range of possible k values
k_values = range(2, 20)

# Store results: one entry per candidate k, in the order of k_values
inertia = []
silhouette_scores = []

# Evaluate K-Means for each k
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=21)
    cluster_labels = kmeans.fit_predict(AutoCrime_standard)

    # Append the inertia and silhouette score
    inertia.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(AutoCrime_standard, cluster_labels))

# Positions in the score lists are translated back to k through k_values
# itself, never through a hard-coded offset, so the result stays correct
# whatever the start of the range is.

# Elbow heuristic: np.diff(inertia)[i] is the change in inertia when going from
# k_values[i] to k_values[i + 1]; the most negative entry is the steepest drop,
# and the k reached by that drop is reported.
steepest_drop = int(np.diff(inertia).argmin())
optimal_k_elbow = k_values[steepest_drop + 1]

# Silhouette: the k with the highest average silhouette score
optimal_k_silhouette = k_values[int(np.argmax(silhouette_scores))]

# Plot the Elbow Method
plt.figure(figsize=(10, 5))
plt.plot(k_values, inertia, 'bx-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.axvline(x=optimal_k_elbow, color='r', linestyle='--')
plt.show()

# Plot the Silhouette Scores
plt.figure(figsize=(10, 5))
plt.plot(k_values, silhouette_scores, 'bx-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score for Optimal k')
plt.axvline(x=optimal_k_silhouette, color='r', linestyle='--')
plt.show()

# Display the optimal k values
print(f"Optimal number of clusters (Elbow Method): {optimal_k_elbow}")
print(f"Optimal number of clusters (Silhouette Score): {optimal_k_silhouette}")

In [None]:
# Plain 2-D array of the 21 clustering columns (one row per sample).
# NOTE: this rebinds X, which earlier held the regression feature frame.
cluster_columns = [
    'pctUrban', 'medIncome', 'PctPopUnderPov', 'PctLess9thGrade', 'PctNotHSGrad',
    'PctBSorMore', 'PctEmploy', 'PctUnemployed', 'HousVacant', 'PctVacantBoarded',
    'PctVacMore6Mos', 'agePct12t29', 'agePct16t24', 'agePct65up', 'RentMedian',
    'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp', 'PctUsePubTrans',
    'autoTheft',
]
X = np.array(autoCrime_pb[cluster_columns]).reshape(-1, 21)

In [None]:
# https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html



from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm


# Cluster counts to analyse: every k from 2 through 20.
range_n_clusters = list(range(2, 21))

# NOTE(review): X is passed to KMeans as built above, with no scaling step in
# this cell - confirm whether standardised features were intended here.
for n_clusters in range_n_clusters:
    # One figure per k: silhouette plot on the left, cluster scatter on the right.
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # Silhouette coefficients lie in [-1, 1]; the visible x-range is fixed to
    # [-0.1, 1], so any sample scoring below -0.1 is drawn outside the view.
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters + 1) * 10 term leaves blank space between the silhouette
    # bands of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Fit K-Means for this k; random_state=21 fixes the seed for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=21)
    cluster_labels = clusterer.fit_predict(X)

    # Average silhouette over all samples: a single-number summary of how
    # dense and well separated the clusters are.
    silhouette_avg = silhouette_score(X, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )

    # Per-sample silhouette coefficients, used to draw one band per cluster.
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Collect the coefficients of the samples assigned to cluster i and
        # sort them so the band has a smooth profile. Boolean indexing returns
        # a copy, so the in-place sort does not touch sample_silhouette_values.
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the band with its cluster number at its vertical midpoint.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Start the next band 10 rows above this one (blank separator rows).
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Vertical line marking the average silhouette score over all samples.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # Right panel: samples plotted on the first two columns of X only,
    # coloured by their assigned cluster.
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(
        X[:, 0], X[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k")

    # Mark each cluster centre (same two columns) with a white disc ...
    centers = clusterer.cluster_centers_
    ax2.scatter(
        centers[:, 0],
        centers[:, 1],
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )

    # ... and draw the cluster index on top of the disc.
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    fig.suptitle(
        "Silhouette analysis for KMeans clustering on data with n_clusters = %d"
        % n_clusters,
        fontsize=14,
        fontweight="bold",
    )

    # Render this figure right after its printed score, then release it so the
    # loop does not keep one open figure per k until the end of the cell.
    plt.show()
    plt.close(fig)