In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
# Switch matplotlib to its built-in "ggplot" style sheet.
plt.style.use("ggplot")

from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold,cross_val_score, GridSearchCV, KFold
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge, Lasso


from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.naive_bayes import GaussianNB

Most Features

In [None]:
# Load the raw crime table.
# The Colab upload location is tried first; the bare filename (resolved against
# the current working directory) is the fallback for runs outside Colab, so the
# path no longer has to be toggled by commenting lines in and out.
CRIME_CSV_CANDIDATES = (
    '/content/sample_data/crimedata2.csv',
    'crimedata2.csv',
)

for csv_path in CRIME_CSV_CANDIDATES:
    try:
        crime = pd.read_csv(csv_path, encoding="ISO-8859-1")
        break
    except FileNotFoundError:
        # Not at this location; try the next candidate.
        continue
else:
    # No candidate existed: fail with the same exception type pandas would raise.
    raise FileNotFoundError(
        "crimedata2.csv not found in any of: " + ", ".join(CRIME_CSV_CANDIDATES)
    )

# Column names, dtypes and non-null counts of the loaded frame.
crime.info()

In [None]:
# Wide candidate set: the autoTheft target plus the columns screened against it.
most_feature_cols = [
    'autoTheft', 'numbUrban', 'pctUrban', 'medIncome', 'pctWWage',
    'pctWFarmSelf', 'pctWInvInc', 'pctWSocSec', 'pctWPubAsst', 'pctWRetire',
    'medFamInc', 'perCapInc', 'NumUnderPov', 'PctPopUnderPov', 'PctNotHSGrad',
    'PctBSorMore', 'PctUnemployed', 'PctEmploy', 'PctEmplManu',
    'PctEmplProfServ', 'PctOccupManu', 'PctOccupMgmtProf', 'PctLargHouseOccup',
    'PersPerOccupHous', 'PersPerOwnOccHous', 'PersPerRentOccHous',
    'PctPersOwnOccup', 'PctPersDenseHous', 'MedNumBR', 'HousVacant',
    'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos',
    'MedRent', 'MedOwnCostPctInc', 'MedOwnCostPctIncNoMtg', 'PctBornSameState',
    'PctSameHouse85', 'PctSameCity85', 'PctSameState85',
]

# Cleaning pipeline, in order:
#   '?' placeholders -> NaN, every column coerced to numeric (unparsable
#   values -> NaN), then only rows without any NaN are kept.
crime_pb = (
    crime[most_feature_cols]
    .replace('?', float('nan'))
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

# Pairwise correlations between all retained columns.
corr = crime_pb.corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='viridis', ax=ax)
ax.set_title('Correlation Heatmap', fontsize=20)
plt.show()

Selected Features

In [None]:
# Narrowed column set: predictors of interest, the autoTheft target and HousVacant.
selected_cols = [
    'pctUrban', 'medIncome', 'PctPopUnderPov', 'PctLess9thGrade', 'PctNotHSGrad',
    'PctBSorMore', 'PctEmploy', 'PctUnemployed', 'PctVacMore6Mos', 'numbUrban',
    'PctVacantBoarded', 'agePct12t29', 'agePct16t24', 'agePct65up', 'RentMedian',
    'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp',
    'PctUsePubTrans', 'autoTheft', 'HousVacant',
]
autoCrime_pb = crime[selected_cols]

In [None]:
# Clean the selected columns, using the same step order as the "Most Features" cell:
#   1. turn the '?' missing-value placeholders into NaN,
#   2. coerce every column to a numeric dtype (anything unparsable also becomes NaN),
#   3. drop every row that still contains a NaN.
# The '?' replacement has to come before the coercion; once the columns are
# numeric there is nothing left for it to match.
autoCrime_pb = autoCrime_pb.replace('?', float('nan'))
autoCrime_pb = autoCrime_pb.apply(pd.to_numeric, errors='coerce')
autoCrime_pb = autoCrime_pb.dropna()

# Display the cleaned frame.
autoCrime_pb

In [None]:
# Correlation heatmap for the selected feature set.
#
# The coercion + NaN drop is repeated here so the correlations are always
# computed on a fully numeric, NaN-free frame. pd.to_numeric(errors='coerce')
# already maps '?' placeholders to NaN, so no separate replace step is needed
# (the earlier one wrote its result to a misspelled variable and had no effect).
autoCrime_pb = autoCrime_pb.apply(pd.to_numeric, errors='coerce').dropna()

# Pairwise correlations between all remaining columns.
corr = autoCrime_pb.corr()

plt.figure(figsize=(20, 12))
sns.heatmap(corr,annot=True, fmt=".2f", cmap='viridis')
plt.title('Correlation Heatmap', fontsize=20)
plt.show()

In [None]:
# Reduced feature set (13 predictors plus the autoTheft target).
autoCrime_pb1 = crime[['pctUrban', 'medIncome', 'PctPopUnderPov', 'PctLess9thGrade', 'PctNotHSGrad',
                       'PctBSorMore', 'PctEmploy', 'PctUnemployed', 'HousVacant', 'PctVacantBoarded',
                       'PctVacMore6Mos', 'RentMedian', 'PctUsePubTrans', 'autoTheft']]

# Convert '?' placeholders to NaN first, while the raw values are still present ...
autoCrime_pb1 = autoCrime_pb1.replace('?', float('nan'))
# ... then force numeric dtypes (anything unparsable also becomes NaN) ...
autoCrime_pb1 = autoCrime_pb1.apply(pd.to_numeric, errors='coerce')
# ... and keep only rows with no NaN left.
autoCrime_pb1 = autoCrime_pb1.dropna()

In [None]:
# Pairwise correlations for the reduced feature set.
corr = autoCrime_pb1.corr()

# Annotated heatmap of that correlation matrix.
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='viridis', ax=ax)
ax.set_title('Correlation Heatmap', fontsize=20)
plt.show()

In [None]:
# Descriptive statistics for each column of the cleaned selected-feature frame.
autoCrime_pb.describe()

In [None]:
# Descriptive statistics for each column of the cleaned reduced-feature frame.
autoCrime_pb1.describe()

In [None]:
# Box plots for medIncome, HousVacant and the autoTheft target.
B1 = autoCrime_pb1.loc[:, ["medIncome", "HousVacant", "autoTheft"]]
B1.boxplot()

In [None]:
# Box plots for the percentage-valued columns of the reduced set.
pct_cols = [
    'pctUrban', 'PctPopUnderPov', 'PctNotHSGrad', 'PctBSorMore',
    'PctEmploy', 'PctUnemployed', 'PctVacMore6Mos', 'PctUsePubTrans',
]
B2 = autoCrime_pb1.loc[:, pct_cols]
B2.boxplot()

In [None]:
# Box plot for RentMedian on its own axes.
B3 = autoCrime_pb1.loc[:, ['RentMedian']]
B3.boxplot()

In [None]:
# Standardize every column of the reduced set (predictors and the autoTheft
# target alike) with StandardScaler.
# The fitted `scaler` is the object that performs the transform, so it stays
# available afterwards (e.g. for scaler.inverse_transform); a second, throwaway
# StandardScaler is no longer created.
scaler = StandardScaler()
autoCrime_pb1_standard = pd.DataFrame(
    scaler.fit_transform(autoCrime_pb1),
    columns=autoCrime_pb1.columns,  # keep the original column labels; the row index restarts at 0
)

In [None]:
# Box plots of all standardized columns on one shared axis.
standardized_ax = autoCrime_pb1_standard.boxplot()

# Tilt the column labels so the long names stay readable.
plt.xticks(
    rotation=45,
    ha='right',
    fontsize=10,
)
plt.show()

In [None]:
# Predictor columns, in the order they are handed to the models.
feature_cols = [
    'pctUrban', 'medIncome', 'PctPopUnderPov', 'PctVacantBoarded', 'numbUrban',
    'PctNotHSGrad', 'PctBSorMore', 'PctEmploy', 'PctUnemployed', 'HousVacant',
    'PctVacMore6Mos', 'agePct12t29', 'agePct16t24', 'agePct65up', 'RentMedian',
    'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp', 'PctUsePubTrans',
]

# Design matrix and target, both taken from the cleaned (unscaled) selected-feature frame.
X = autoCrime_pb[feature_cols]
Y = autoCrime_pb['autoTheft']

In [None]:
# Data partition: 80% train / 20% test.
# The random_state value itself is arbitrary; it only has to stay the same
# everywhere in the notebook.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=21,
)

In [None]:
# Estimators compared in the cross-validation cells, keyed by display name.
# Insertion order is the order in which results are computed and printed.
models = {}
models['Linear Regression'] = LinearRegression()
models['Decision Tree'] = DecisionTreeRegressor(random_state=21)
models['Random Forest'] = RandomForestRegressor(random_state=21)
# NOTE(review): GaussianNB is a classification estimator, yet it is scored here
# with regression metrics alongside the regressors -- confirm this is intended.
models['Naive Bayes'] = GaussianNB()
models['Gradient Boosting'] = GradientBoostingRegressor(random_state=21)


In [None]:
# 5-fold splitter (k = 5): rows are shuffled before being split, with a fixed
# seed so that every use of `kf` yields the same folds.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=21,
)

# Scoring names reference:
# https://scikit-learn.org/stable/modules/model_evaluation.html

In [None]:
# Cross-validate every candidate model on the folds defined by `kf` and collect
# MSE, RMSE and R2 per fold.
#
# cross_validate scores all three metrics from a single fit per fold, so each
# model is trained once per fold instead of three times (one cross_val_score
# call per metric). The folds and the per-fold scores are unchanged.
scoring = {
    'mse': 'neg_mean_squared_error',
    'rmse': 'neg_root_mean_squared_error',
    'r2': 'r2',
}

results = {}

for name, model in models.items():
    cv_out = cross_validate(model, X, Y, cv=kf, scoring=scoring)

    # scikit-learn reports errors as negated values ("higher is better"),
    # so flip the sign to get ordinary positive MSE / RMSE.
    mse_scores = -cv_out['test_mse']
    rmse_scores = -cv_out['test_rmse']
    r2_scores = cv_out['test_r2']

    results[name] = {
        'MSE': mse_scores,
        'MSE Mean': np.mean(mse_scores),
        'MSE Std': np.std(mse_scores),
        'RMSE': rmse_scores,
        'RMSE Mean': np.mean(rmse_scores),
        'RMSE Std': np.std(rmse_scores),
        'R2': r2_scores,
        'R2 Mean': np.mean(r2_scores),
        'R2 Std': np.std(r2_scores)
    }

# Print Results
for name, metrics in results.items():
    print(f"\n{name}")
    print(f"Mean MSE: {metrics['MSE Mean']:.2f}, Std MSE: {metrics['MSE Std']:.2f}")
    print(f"Mean RMSE: {metrics['RMSE Mean']:.2f}, Std RMSE: {metrics['RMSE Std']:.2f}")
    print(f"Mean R2: {metrics['R2 Mean']:.2f}, Std R2: {metrics['R2 Std']:.2f}")

In [None]:
# Same model comparison as the 5-fold cell, repeated with k = 10.
kf = KFold(n_splits=10, shuffle=True, random_state=21)    # 10 folds

# cross_validate scores all three metrics from a single fit per fold, so each
# model is trained once per fold instead of three times (one cross_val_score
# call per metric). The folds and the per-fold scores are unchanged.
scoring = {
    'mse': 'neg_mean_squared_error',
    'rmse': 'neg_root_mean_squared_error',
    'r2': 'r2',
}

results1 = {}

for name, model in models.items():
    cv_out = cross_validate(model, X, Y, cv=kf, scoring=scoring)

    # scikit-learn reports errors as negated values ("higher is better"),
    # so flip the sign to get ordinary positive MSE / RMSE.
    mse_scores = -cv_out['test_mse']
    rmse_scores = -cv_out['test_rmse']
    r2_scores = cv_out['test_r2']

    results1[name] = {
        'MSE': mse_scores,
        'MSE Mean': np.mean(mse_scores),
        'MSE Std': np.std(mse_scores),
        'RMSE': rmse_scores,
        'RMSE Mean': np.mean(rmse_scores),
        'RMSE Std': np.std(rmse_scores),
        'R2': r2_scores,
        'R2 Mean': np.mean(r2_scores),
        'R2 Std': np.std(r2_scores)
    }

# Print Results
for name, metrics in results1.items():
    print(f"\n{name}")
    print(f"Mean MSE: {metrics['MSE Mean']:.2f}, Std MSE: {metrics['MSE Std']:.2f}")
    print(f"Mean RMSE: {metrics['RMSE Mean']:.2f}, Std RMSE: {metrics['RMSE Std']:.2f}")
    print(f"Mean R2: {metrics['R2 Mean']:.2f}, Std R2: {metrics['R2 Std']:.2f}")

In [None]:
# LINEAR REGRESSION -- manual k-fold evaluation with per-fold output

# Fold count; the model cells further down reuse this `k`.
k = 5
crossvalidation = KFold(n_splits=k, shuffle=True, random_state=21)

# Estimator under evaluation (refit from scratch on every fold)
reg = LinearRegression()

# Per-fold metric accumulators
r2_scores, mse_scores, rmse_scores = [], [], []

for train_index, test_index in crossvalidation.split(X):
    # Rows used for fitting vs. rows held out for this fold
    X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
    X_test, Y_test = X.iloc[test_index], Y.iloc[test_index]

    # Fit on the training rows, then predict the held-out rows
    y_pred = reg.fit(X_train, Y_train).predict(X_test)

    # Fold-level scores; RMSE is derived from this fold's MSE
    r2 = r2_score(Y_test, y_pred)
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)

    r2_scores.append(r2)
    mse_scores.append(mse)
    rmse_scores.append(rmse)

    print(f"Fold results: R-squared = {r2:.4f}, MSE = {mse:.4f}, RMSE = {rmse:.4f}")

# Mean of each metric over the k folds
print(f"\nAverage results: R-squared = {np.mean(r2_scores):.4f}, MSE = {np.mean(mse_scores):.4f}, RMSE = {np.mean(rmse_scores):.4f}")


In [None]:
# DECISION TREE -- manual k-fold evaluation with per-fold output
#
# All names used here (DecisionTreeRegressor, KFold, the metric functions,
# numpy) come from the notebook's import cell; they are not re-imported.
# `k`, `X` and `Y` are defined in earlier cells.

# Initialize the model (fixed seed so the fitted trees are reproducible)
dt = DecisionTreeRegressor(random_state=21)

# Lists to store metrics for each fold
r2_scores = []
mse_scores = []
rmse_scores = []

# Initialize k-fold cross-validation (same seed as the other model cells, so
# every model is evaluated on identical folds)
crossvalidation = KFold(n_splits=k, shuffle=True, random_state=21)

# Perform k-fold cross-validation
for train_index, test_index in crossvalidation.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Fit the model
    dt.fit(X_train, Y_train)

    # Predict on the test set
    y_pred = dt.predict(X_test)

    # Calculate metrics
    r2 = r2_score(Y_test, y_pred)
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)

    # Append metrics to lists
    r2_scores.append(r2)
    mse_scores.append(mse)
    rmse_scores.append(rmse)

    # Print metrics for the current fold
    print(f"Fold results: R-squared = {r2:.4f}, MSE = {mse:.4f}, RMSE = {rmse:.4f}")

# Print average metrics across all folds
print(f"\nAverage results: R-squared = {np.mean(r2_scores):.4f}, MSE = {np.mean(mse_scores):.4f}, RMSE = {np.mean(rmse_scores):.4f}")

In [None]:
# RANDOM FOREST -- manual k-fold evaluation with per-fold output
#
# All names used here (RandomForestRegressor, KFold, the metric functions,
# numpy) come from the notebook's import cell; they are not re-imported.
# `k`, `X` and `Y` are defined in earlier cells.

# Initialize the model (fixed seed so the fitted forest is reproducible)
rf = RandomForestRegressor(random_state=21)

# Lists to store metrics for each fold
r2_scores = []
mse_scores = []
rmse_scores = []

# Initialize k-fold cross-validation (same seed as the other model cells, so
# every model is evaluated on identical folds)
crossvalidation = KFold(n_splits=k, shuffle=True, random_state=21)

# Perform k-fold cross-validation
for train_index, test_index in crossvalidation.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Fit the model
    rf.fit(X_train, Y_train)

    # Predict on the test set
    y_pred = rf.predict(X_test)

    # Calculate metrics
    r2 = r2_score(Y_test, y_pred)
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)

    # Append metrics to lists
    r2_scores.append(r2)
    mse_scores.append(mse)
    rmse_scores.append(rmse)

    # Print metrics for the current fold
    print(f"Fold results: R-squared = {r2:.4f}, MSE = {mse:.4f}, RMSE = {rmse:.4f}")

# Print average metrics across all folds
print(f"\nAverage results: R-squared = {np.mean(r2_scores):.4f}, MSE = {np.mean(mse_scores):.4f}, RMSE = {np.mean(rmse_scores):.4f}")

In [None]:
# GRADIENT BOOSTING -- manual k-fold evaluation with per-fold output
#
# GradientBoostingRegressor comes from the notebook's import cell. The
# classifier variant was imported here before but never used, so that import
# is gone. `k`, `X` and `Y` are defined in earlier cells.

# Initialize the model. The variable keeps its historical name `gbc`, but the
# estimator is a regressor.
gbc = GradientBoostingRegressor(random_state=21)

# Lists to store metrics for each fold
r2_scores = []
mse_scores = []
rmse_scores = []

# Initialize k-fold cross-validation explicitly instead of relying on the
# `crossvalidation` object left behind by the previous cell. Same settings as
# the other model cells, so the folds (and therefore the scores) are unchanged.
crossvalidation = KFold(n_splits=k, shuffle=True, random_state=21)

# Perform k-fold cross-validation
for train_index, test_index in crossvalidation.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Fit the model
    gbc.fit(X_train, Y_train)

    # Predict on the test set
    y_pred = gbc.predict(X_test)

    # Calculate metrics
    r2 = r2_score(Y_test, y_pred)
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)

    # Append metrics to lists
    r2_scores.append(r2)
    mse_scores.append(mse)
    rmse_scores.append(rmse)

    # Print metrics for the current fold
    print(f"Fold results: R-squared = {r2:.4f}, MSE = {mse:.4f}, RMSE = {rmse:.4f}")

# Print average metrics across all folds
print(f"\nAverage results: R-squared = {np.mean(r2_scores):.4f}, MSE = {np.mean(mse_scores):.4f}, RMSE = {np.mean(rmse_scores):.4f}")

In [None]:
# NAIVE BAYES -- manual k-fold evaluation with per-fold output
#
# GaussianNB comes from the notebook's import cell; it is not re-imported.
# `k`, `X` and `Y` are defined in earlier cells.
#
# NOTE(review): GaussianNB is a classification estimator, so the autoTheft
# values are presumably treated as class labels rather than as a continuous
# quantity. Confirm this is intended before comparing these scores with the
# regressors above.
nb = GaussianNB()

# Lists to store metrics for each fold
r2_scores = []
mse_scores = []
rmse_scores = []

# Initialize k-fold cross-validation (same seed as the other model cells, so
# every model is evaluated on identical folds)
crossvalidation = KFold(n_splits=k, shuffle=True, random_state=21)

# Perform k-fold cross-validation
for train_index, test_index in crossvalidation.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Fit the model
    nb.fit(X_train, Y_train)

    # Predict on the test set
    y_pred = nb.predict(X_test)

    # Calculate metrics
    r2 = r2_score(Y_test, y_pred)
    mse = mean_squared_error(Y_test, y_pred)
    rmse = np.sqrt(mse)

    # Append metrics to lists
    r2_scores.append(r2)
    mse_scores.append(mse)
    rmse_scores.append(rmse)

    # Print metrics for the current fold
    print(f"Fold results: R-squared = {r2:.4f}, MSE = {mse:.4f}, RMSE = {rmse:.4f}")

# Print average metrics across all folds
print(f"\nAverage results: R-squared = {np.mean(r2_scores):.4f}, MSE = {np.mean(mse_scores):.4f}, RMSE = {np.mean(rmse_scores):.4f}")



In [None]:
# Models for k = 10: raise the fold count and rebuild the shuffled, seeded splitter.
k = 10
crossvalidation = KFold(
    n_splits=k,
    shuffle=True,
    random_state=21,
)