In [None]:
import os
import wget
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer  # unused, but needed
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_validate
from sklearn.decomposition import PCA
import sklearn.metrics as metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
import xgboost as xgb
from xgboost import XGBClassifier


In [None]:
#download data (only when no local copy exists, so re-running the cell does not fetch it again)
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data'
csv_path = 'communities_crimes.csv'
if not os.path.exists(csv_path):
    wget.download(url, csv_path)

#save as data frame
# The file has no header row, so columns get integer labels. '?' marks a missing value;
# declaring it at parse time lets pandas type those columns as numeric instead of leaving
# them as object columns of strings.
data_communities_crimes = pd.read_csv(csv_path, header = None, na_values = '?')

#train - test split
# First 1495 rows are the training set, the remainder the test set; the first 5 columns
# are left out of both splits.
n_train_rows = 1495
n_leading_cols_dropped = 5
train_communities_crimes = data_communities_crimes.iloc[: n_train_rows, n_leading_cols_dropped : ]
test_communities_crimes = data_communities_crimes.iloc[n_train_rows : , n_leading_cols_dropped : ]

In [None]:
#data imputation function
def impute_data(data, class_col):
    """Fill missing feature values using a freshly fitted SimpleImputer with default settings.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns plus the column labelled class_col.
    class_col : column label
        Column that is excluded from imputation and passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        class_col as the first column, followed by the imputed feature columns
        (in the order produced by Index.difference), on a fresh 0..n-1 row index.
    """
    frame = data.reset_index(drop = True)
    feature_cols = frame.columns.difference([class_col])

    # The imputer is fitted on the very frame it fills
    filled_values = SimpleImputer().fit_transform(frame.loc[:, feature_cols])
    features_df = pd.DataFrame(filled_values, columns = feature_cols)
    target_df = pd.DataFrame(frame.loc[:, class_col])

    # Both pieces share the reset index, so the join lines rows up one-to-one
    return target_df.join(features_df)

#imputing train, test data
# The last column is treated as the target
data_crime_label = data_communities_crimes.columns[-1]

# Drop the leading columns by label rather than by position: the frame was read with
# header = None, so its columns are labelled with integers and .loc[:, 5:] keeps labels >= 5.
# Unlike a positional .iloc[:, 5:], re-running this cell does not strip five more columns.
data_communities_crimes = data_communities_crimes.loc[ : , 5 : ]
data_communities_crimes_imputed = impute_data(data_communities_crimes, data_crime_label)

# NOTE(review): each call fits its own imputer, so the test split is filled using
# statistics computed from the test rows themselves - confirm this is intended.
train_communities_crimes_imputed = impute_data(train_communities_crimes, data_crime_label)
test_communities_crimes_imputed = impute_data(test_communities_crimes, data_crime_label)

In [None]:
#correlation matrix of features
# Computed once and shared by both plots
corr_crime = data_communities_crimes.corr()

# plt.matshow() opens a figure of its own by default, which left the 15x15 figure blank.
# Drawing through an explicit Axes puts the matrix on the figure with the requested size.
fig_corr, ax_corr = plt.subplots(figsize=(15,15))
ax_corr.matshow(corr_crime)
plt.show()


plot_corr_mat = sns.heatmap(corr_crime,
                            vmin=-1,
                            cmap='PiYG',
                            annot=False);

In [None]:
#calculate CV (= std/mean) of each feature
std_crime = pd.DataFrame(data_communities_crimes.std())
mean_crime = pd.DataFrame(data_communities_crimes.mean())
cv_crime = std_crime / mean_crime
cv_crime = cv_crime.sort_values(by = 0, ascending = False)

#pick highest CV
# floor(sqrt(128)) features are kept; 128 is presumably the attribute count of the raw file - TODO confirm
num_crime = int(np.floor(np.sqrt(128)))
# cv_crime ranks the label column as well; leave it out of the pick, otherwise the join
# below would find the same column name on both sides and fail.
features_crime = cv_crime.index.drop(data_crime_label, errors = 'ignore')[:num_crime]
# Label and features are both taken from the imputed training frame so the rows line up
# one-to-one (the label used to come from the full data set, which has more rows).
cv_highest_crime = pd.DataFrame(train_communities_crimes_imputed[data_crime_label]).join(train_communities_crimes_imputed.loc[ : , features_crime])

In [None]:
#scatterplot
def draw_scatterplot(dataframe, name, figure = None) :
    """Plot every column of dataframe against its first column on a 3-wide grid and save it.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Column 0 is used as the y variable (and as hue); each remaining column gets one panel.
    name : str
        Path handed to savefig for the finished figure.
    figure : matplotlib.figure.Figure, optional
        Figure to draw on. When omitted, the module-level variable `fig` is used,
        which must have been created before the call.
    """
    target_fig = fig if figure is None else figure
    y_col = dataframe.columns[0]
    num_plots = len(dataframe.columns) - 1
    # add_subplot requires integer grid dimensions, while np.ceil returns a float
    num_rows = int(np.ceil(num_plots / 3))

    for i in range(1, len(dataframe.columns)):
        ax = target_fig.add_subplot(num_rows, 3, i)
        sns.scatterplot(data = dataframe, x = dataframe.columns[i], y = y_col,
                        hue = y_col, ax = ax)

        # Drop the per-panel legend, but only if seaborn actually drew one
        panel_legend = ax.get_legend()
        if panel_legend is not None:
            panel_legend.remove()

    target_fig.savefig(name)

# Canvas for the scatter grid; it must be bound to the name `fig` before draw_scatterplot runs
fig = plt.figure(figsize = (15, 20))
fig.subplots_adjust(wspace = .4, hspace = .4)
draw_scatterplot(dataframe = cv_highest_crime, name = 'plot_scatter_cv_crime.png')

In [None]:
#linear regression
# In the imputed frames column 0 is the target and every later column is a predictor
train_crime_y = train_communities_crimes_imputed.iloc[:, 0]
train_crime_x = train_communities_crimes_imputed.iloc[:, 1:]
test_crime_y = test_communities_crimes_imputed.iloc[:, 0]
test_crime_x = test_communities_crimes_imputed.iloc[:, 1:]

# fit() returns the estimator itself, so construction and fitting can be chained
lr = LinearRegression().fit(train_crime_x, train_crime_y)

# The reported "error" is one minus the estimator's score on the test split
test_error_crime = 1 - lr.score(X = test_crime_x, y = test_crime_y)

In [None]:
#Ridge regression
# Candidate penalties: powers of ten from 10**-5 up to 10**2
alpha = [10**exponent for exponent in range(-5,3)]
best_score_ridge, best_alpha_ridge = float('-inf'), None

# Keep the penalty with the highest mean 10-fold cross-validation score;
# on ties the earlier (smaller) candidate wins because the comparison is strict.
for a in alpha:
    ridge = Ridge(alpha = a)
    cv_results = cross_validate(ridge, train_crime_x, train_crime_y, cv = 10)
    score = cv_results['test_score'].mean()
    if score > best_score_ridge:
        best_score_ridge, best_alpha_ridge = score, a

# Refit on the whole training split with the chosen penalty, then evaluate on the test split
ridge_model = Ridge(alpha = best_alpha_ridge).fit(train_crime_x, train_crime_y)
test_error_crime_ridge = 1 - ridge_model.score(X = test_crime_x, y = test_crime_y)

In [None]:
#Lasso
best_score_lasso, best_alpha_lasso = float('-inf'), None

# Same search as any grid over `alpha`: keep the penalty with the highest mean
# 10-fold cross-validation score (strict comparison, so the first best one wins).
for a in alpha:
    lasso = Lasso(alpha = a)
    cv_results = cross_validate(lasso, train_crime_x, train_crime_y, cv = 10)
    score = cv_results['test_score'].mean()
    if score > best_score_lasso:
        best_score_lasso, best_alpha_lasso = score, a

# Refit on the whole training split with the chosen penalty, then evaluate on the test split
lasso_model = Lasso(alpha = best_alpha_lasso).fit(train_crime_x, train_crime_y)
test_error_crime_lasso = 1 - lasso_model.score(X = test_crime_x, y = test_crime_y)

# NOTE(review): this mask keeps the columns whose coefficient is exactly zero, i.e. the
# variables the model dropped. If the variables it kept are wanted, the test should be != 0 - confirm.
zero_coef_mask = lasso_model.coef_ == 0
lasso_variables = train_crime_x.columns[zero_coef_mask]

In [None]:
#PCR
# The projection does not depend on how many components are kept afterwards, so PCA is
# fitted once here instead of once per candidate inside the loop.
pca = PCA()
train_crime_x_components = pd.DataFrame(pca.fit_transform(train_crime_x))

best_score_pcr = float('-inf')
best_m_pcr = None
# range() excludes its end point; the + 1 makes "keep every component" a candidate too
for i in range(1, train_crime_x_components.shape[1] + 1):
    # Score a linear regression on the first i components with 10-fold cross-validation
    cv_results = cross_validate(lr, train_crime_x_components.iloc[:,:i], train_crime_y, cv = 10)
    score = np.mean(cv_results['test_score'])
    if(score > best_score_pcr):
        best_score_pcr = score
        best_m_pcr = i

train_crime_x_reduced = train_crime_x_components.iloc[:,:best_m_pcr]
# NOTE(review): this refits the shared `lr` estimator on the reduced features, so from here
# on `lr` no longer holds the plain linear-regression fit.
lr.fit(train_crime_x_reduced, train_crime_y)
# Project the test split with the PCA fitted on the training split
test_crime_x_reduced = pd.DataFrame(pca.transform(test_crime_x)).iloc[:,:best_m_pcr]
test_error_crime_pcr = 1 - lr.score(test_crime_x_reduced, test_crime_y)

In [None]:
#xgboost
# Wrap the train / test feature frames in DMatrix objects (features only, no label is passed here)
d_train_crime_x = xgb.DMatrix(data = train_crime_x)
d_test_crime_x = xgb.DMatrix(data = test_crime_x)

