## Statistical Modeling

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as smf
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

### Data cleaning

In [3]:
# a little more scrubbing
starting_data = pd.read_csv('cleanedData.csv')
selected = starting_data.drop(['Feet From Curb','Date First Observed'], axis = 1)
selected = selected.dropna()

In [4]:
small_df = selected.iloc[:,1:]

In [14]:
# throws error
#def convert_to_hours(x):
#    mid = int(len(x) / 2)
#    hrs = int(x[0:mid])
#    if (x[len(x) - 1] == 'P' or x[len(x) - 1] == 'A'):
 #       try:
  #          mins = float(x[mid:len(x) - 1])
   #     except ValueError:
     #       print ("error",ValueError,"on line",x)
    #    is_pm = x[len(x) - 1] == 'P'
    #else:
     #   mins = float(x[mid:len(x)])
      #  is_pm = False
   # if (is_pm):
    #    hrs = hrs + 12
    #return hrs

In [13]:
#small_df['Violation Time'] = small_df['Violation Time'].dropna().apply(lambda x: convert_to_hours(x))

### Adding Dummy Variables

In [5]:
one_hot = pd.get_dummies(small_df['Registration State'])
one_hot.columns = ['Reg_State_' + str(col) for col in one_hot.columns]

small_df.drop(columns=['Registration State'],inplace=True)

small_df = small_df.join(one_hot)

In [6]:
one_hot_pl_type = pd.get_dummies(small_df['Plate Type'])
one_hot_pl_type.columns = ['plate_type_' + str(col) for col in one_hot_pl_type.columns]

small_df.drop(columns=['Plate Type'],inplace=True)

small_df = small_df.join(one_hot_pl_type)


In [7]:
one_hot_v_body_type = pd.get_dummies(small_df['Vehicle Body Type'])
one_hot_v_body_type.columns = ['v_body_type_' + str(col) for col in one_hot_v_body_type.columns]

small_df.drop(columns=['Vehicle Body Type'],inplace=True)

small_df = small_df.join(one_hot_v_body_type)

In [8]:
one_hot_v_make = pd.get_dummies(small_df['Vehicle Make'])
one_hot_v_make.columns = ['v_make_' + str(col) for col in one_hot_v_make.columns]

small_df.drop(columns=['Vehicle Make'],inplace=True)

small_df = small_df.join(one_hot_v_make)


In [9]:
one_hot_issuing_agency = pd.get_dummies(small_df['Issuing Agency'])
one_hot_issuing_agency.columns = ['issuing_agency_' + str(col) for col in one_hot_issuing_agency.columns]

small_df.drop(columns=['Issuing Agency'],inplace=True)

small_df = small_df.join(one_hot_issuing_agency)

In [10]:
one_hot_sub_div = pd.get_dummies(small_df['Sub Division'])
one_hot_sub_div.columns = ['sub_div_' + str(col) for col in one_hot_sub_div.columns]

small_df.drop(columns=['Sub Division'],inplace=True)

small_df = small_df.join(one_hot_sub_div)

In [11]:
one_hot_vio_cty = pd.get_dummies(small_df['Violation County'])
one_hot_vio_cty.columns = ['vio_cty_' + str(col) for col in one_hot_vio_cty.columns]

small_df.drop(columns=['Violation County'],inplace=True)

small_df = small_df.join(one_hot_vio_cty)

In [12]:
# throws error
#one_hot_street_name = pd.get_dummies(small_df['Street Name'])
#one_hot_street_name.columns = ['strt_name_' + str(col) for col in one_hot_street_name.columns]

#small_df.drop(columns=['Street Name'],inplace=True)

#small_df = small_df.join(one_hot_street_name)

### Creating a smaller dataset for Statistical Analysis

In [15]:
small_df = small_df.drop(['Street Name', 'Violation Time'], axis = 1)
train_features, test_features, train_outcome, test_outcome = train_test_split(
   small_df.drop(columns=['Violation Code']),      # features
   small_df['Violation Code'],    # outcome
   test_size=0.50 # percentage of data to use as the test set
)

In [None]:
def cleaning(df):
    names = np.empty_like(df.columns.values)
    count = 0;
    temp = df.columns.values
    for col in temp:
        if('/' in col):
            col = col.replace('/', '_')
        if(' ' in col):
            col = col.replace(' ', '_')
        if('-' in col):
            col = col[0:col.find('-')] + '_'+ col[col.find('-') + 1:]
        if(';' in col): 
            col = col[0:col.find(';')] + '_'+ col[col.find(';') + 1:]
        if('.' in col):
            col = col.replace('.', '')
        col = col.replace('+', '_')
        
        names[count] = col
        count += 1
    df.columns = names
    
    return df.select_dtypes(['number']) 

In [47]:
# these characters cause problems for the the forward selection function
train_features = cleaning(train_features)

In [48]:
# memory errors, cut data down even more
train_features['result'] = train_outcome
work_data = train_features.sample(1500)

### Forward Feature Selection

In [49]:
# forward selection model
def forward_selected(data, response):
    """Linear model designed by forward selection.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response,
                                           ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response,
                                   ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model

In [50]:
model = forward_selected(work_data, 'result')

KeyboardInterrupt: 

In [None]:
model.summary()

### Polynomial Regression

In [55]:
#result = work_data['result']
work_data = work_data.select_dtypes(['number'])
# work_data = work_data.drop('result', axis = 1)

In [68]:
from sklearn.feature_selection import SelectPercentile
selector = SelectPercentile()
poly_reg = LinearRegression()

pipe = make_pipeline(PolynomialFeatures(), selector, poly_reg)
param_grid = {'polynomialfeatures__degree': np.arange(7),
              'selectpercentile__percentile': range(10,100,5),
              'linearregression__fit_intercept': [True, False],
              'linearregression__normalize': [True, False]}

grid = GridSearchCV(pipe, param_grid, cv=7)
grid.fit(work_data, result)
model2 = grid.best_estimator_

  f = msb / msw


ValueError: Found array with 0 feature(s) (shape=(1285, 0)) while a minimum of 1 is required.

In [None]:
test_features = cleaning(test_features)
test_data = test_features.sample(1500)
grid.score(test_features, test_outcome)