In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
# Read the training and test feature CSVs into DataFrames.
train = pd.read_csv('data/train_data_features_fix.csv')
data = pd.read_csv('data/test_data_features.csv')

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
data.head(n=5)

In [None]:
# Remove the 'id' column from the test frame.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op instead of a KeyError.
data = data.drop(columns=['id'], errors='ignore')

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
def preprocess_data(X_train, X_new):
    """Standardise ``X_new`` with a scaler fitted on ``X_train``.

    A fresh ``StandardScaler`` is fitted on ``X_train`` only; that fitted
    scaler is then applied to ``X_new``. ``X_train`` itself is not transformed.

    Args:
        X_train: Data used to fit the scaler.
        X_new: Data to transform with the fitted scaler.

    Returns:
        The result of ``StandardScaler.transform`` applied to ``X_new``.
    """
    return StandardScaler().fit(X_train).transform(X_new)

In [None]:
# Scale the test features using a scaler fitted on `train`, overwriting `data`.
# StandardScaler.transform returns a NumPy array by default, so after the first
# run `data` is no longer a DataFrame; the guard stops an accidental re-run of
# this cell from standardising the already-scaled values a second time.
if isinstance(data, pd.DataFrame):
    data = preprocess_data(X_train=train, X_new=data)

In [None]:
# Names of the per-class probability columns in the prediction files:
# make_predictions builds its output frame with columns ['Id'] + labels.
# NOTE(review): presumably this order must match the class order of the pickled
# models' predict_proba output — confirm against the training code.
labels = [
    'ARSON',
    'ASSAULT',
    'BAD CHECKS',
    'BRIBERY',
    'BURGLARY',
    'DISORDERLY CONDUCT',
    'DRIVING UNDER THE INFLUENCE',
    'DRUG/NARCOTIC',
    'DRUNKENNESS',
    'EMBEZZLEMENT',
    'EXTORTION',
    'FAMILY OFFENSES',
    'FORGERY/COUNTERFEITING',
    'FRAUD',
    'GAMBLING',
    'KIDNAPPING',
    'LARCENY/THEFT',
    'LIQUOR LAWS',
    'LOITERING',
    'MISSING PERSON',
    'NON-CRIMINAL',
    'OTHER OFFENSES',
    'PORNOGRAPHY/OBSCENE MAT',
    'PROSTITUTION',
    'RECOVERED VEHICLE',
    'ROBBERY',
    'RUNAWAY',
    'SECONDARY CODES',
    'SEX OFFENSES FORCIBLE',
    'SEX OFFENSES NON FORCIBLE',
    'STOLEN PROPERTY',
    'SUICIDE',
    'SUSPICIOUS OCC',
    'TREA',
    'TRESPASS',
    'VANDALISM',
    'VEHICLE THEFT',
    'WARRANTS',
    'WEAPON LAWS'
]

In [None]:
import pickle

In [None]:
def make_predictions(X, model_name=None, labels=labels):
    """Return class-probability predictions for ``X``, cached on disk as CSV.

    If ``predictions/<model_name>_submissions.csv`` already exists it is read
    back and returned without loading any model. Otherwise the pickled
    classifier for ``model_name`` is loaded from ``models/``, its
    ``predict_proba`` output is written to that CSV and returned.

    Args:
        X: Feature matrix passed to the model's ``predict_proba``.
        model_name: One of ``'logistic_regression'``, ``'decision_tree'`` or
            ``'random_forest'``.
        labels: Column names for the probability columns, one per column of
            the ``predict_proba`` output.

    Returns:
        A DataFrame with an integer ``Id`` column (0..n-1) followed by one
        column per entry of ``labels``.

    Raises:
        TypeError: If ``model_name`` is not a string (e.g. left as ``None``).
        ValueError: If predictions are not cached and ``model_name`` is not
            one of the known models.
    """
    model_files = {
        'logistic_regression': 'log_reg_classifier.pkl',
        'decision_tree': 'decision_tree_classifier.pkl',
        'random_forest': 'random_forest_classifier.pkl',
    }

    # Fail with a clear message instead of an opaque str + None concatenation error.
    if not isinstance(model_name, str):
        raise TypeError(
            'model_name must be one of %s, got %r' % (sorted(model_files), model_name)
        )

    predictions_path = 'predictions/' + model_name + '_submissions.csv'

    # Cache hit: reuse the previously written predictions.
    if os.path.isfile(predictions_path):
        print('Predictions already exists.')
        print('Model name : ', model_name)
        return pd.read_csv(filepath_or_buffer=predictions_path)

    # Only a cache miss needs a model file, so validate the name here; this
    # avoids trying to open the bare 'models/' directory for an unknown name.
    if model_name not in model_files:
        raise ValueError(
            'Unknown model_name %r; expected one of %s' % (model_name, sorted(model_files))
        )
    model_path = 'models/' + model_files[model_name]

    # SECURITY: pickle.load can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

    probas = model.predict_proba(X)
    print(probas.shape)

    # NOTE(review): assumes the order of `labels` matches the column order of
    # predict_proba — confirm against the training code.
    pred_df = pd.DataFrame(data=probas, columns=list(labels))
    # Row-position ids, kept as integers (no float round-trip).
    pred_df.insert(0, 'Id', np.arange(probas.shape[0]))

    # Make sure the output directory exists before writing the cache file.
    os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
    pred_df.to_csv(path_or_buf=predictions_path, index=False)
    return pred_df

In [None]:
# Logistic-regression predictions (computed, or read back if already saved).
model_name = 'logistic_regression'
lr_preds = make_predictions(data, model_name)

In [None]:
# Decision-tree predictions (computed, or read back if already saved).
model_name = 'decision_tree'
dt_preds = make_predictions(data, model_name)

In [None]:
# Random-forest predictions (computed, or read back if already saved).
model_name = 'random_forest'
rf_preds = make_predictions(data, model_name)