# Testing Lasso and Ridge regressions

### We're using Lasso to "automatically" reduce the number of features.

In [1]:
import ipynb
from ipynb.fs.full.joinFeatures import mergedData
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd
from statsmodels.graphics.regressionplots import plot_leverage_resid2
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns

from bokeh.io import curdoc
from bokeh.layouts import column, row 
from bokeh.plotting import figure, show, output_file, output_notebook
from bokeh.palettes import Spectral11, colorblind, Inferno, BuGn, brewer
from bokeh.models import ColumnDataSource, DataRange1d, Select, HoverTool, value, LabelSet, Legend, ColumnDataSource,LinearColorMapper,BasicTicker, PrintfTickFormatter, ColorBar
from bokeh.plotting import figure
from bokeh.resources import CDN
from bokeh.embed import file_html
import os
#print(os.listdir("./"))
# Send Bokeh output to the notebook instead of a standalone HTML file.
output_notebook()

# Notebook-wide matplotlib defaults: 12x8 figures, 10 pt font.
plt.rcParams.update({
    "figure.figsize": (12, 8),
    "font.size": 10,
})

# Comma-separated Bokeh tool names (presumably passed as `tools=` to figure()
# further down the notebook -- confirm against the plotting cells).
TOOLS = 'save,pan,box_zoom,reset,wheel_zoom,hover'

### Load data

1. Select a state
2. Create dummy variables


In [141]:
# Load the merged feature table and keep the rows of a single state.
# `.copy()` detaches the filtered rows from the full table, so the column
# assignments below write to an independent DataFrame instead of a slice
# (the pattern pandas flags with SettingWithCopyWarning -- a warning that
# warnings.filterwarnings('ignore') would otherwise hide).
allData1 = mergedData()
allData1 = allData1[allData1["State"] == "OHIO"].copy()

# Step dummies: 1 for rows whose Year is strictly greater than the cut-off,
# 0 otherwise (rows with a missing Year compare False and therefore get 0).
allData1["YearDummy2005"] = (allData1["Year"] > 2005).astype("int64")
allData1["YearDummy1980"] = (allData1["Year"] > 1980).astype("int64")
allData1["YearDummy2013"] = (allData1["Year"] > 2013).astype("int64")

model_allStates_pmdi1.csv
model_allStates_minTemp1.csv
model_allStates_precip.csv
model_allStates_quality.csv
model_allStates_zndx1.csv
model_allStates_pdsi1.csv
model_allStates_podcount.csv
model_allStates_yields.csv
model_allStates_maxTemp1.csv


### scikit-learn is a powerful library for creating models.  

In [142]:
#Scikit learn
from sklearn.feature_selection import VarianceThreshold
#from sklearn.svm import LinearSV
from sklearn import linear_model 
from sklearn.feature_selection import SelectFromModel
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import PolynomialFeatures

### Grid search over alpha hyperparameter

Essentially, the idea is to find the best value of the alpha hyperparameter (alpha = 0 is ordinary linear regression; higher values reduce the number of features).  

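1. Expand the inputs into degree-3 polynomial features (e.g., for Year and Tmp: Year, Tmp, Year², Year*Tmp, Tmp², Year³, Year²*Tmp, Year*Tmp², Tmp³).
2. Reduce the number of features: for each alpha, keep only the features Lasso leaves with a non-zero coefficient, refit on the training data, and score the predictions on the test data.
3. Use an expanding window, e.g., train on 1925-1990 and test on 1991-1995, then train on 1925-1995 and test on 1996-2000, and so on.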

In [147]:
def changePolyNames(polyX, poly, features):
    """Wrap a polynomial feature matrix in a DataFrame with formula-safe column names.

    Parameters
    ----------
    polyX : array-like, shape (n_rows, n_terms)
        Output of ``poly.fit_transform`` / ``poly.transform``.
    poly : fitted transformer
        Object exposing ``get_feature_names_out`` (scikit-learn >= 1.0) or the
        legacy ``get_feature_names``; either is called with ``features``.
    features : list of str
        Names of the input columns that were expanded.

    Returns
    -------
    pandas.DataFrame
        ``polyX`` with one column per term. In the term names, spaces become
        ``_``, ``^2`` becomes ``sq`` and ``^3`` becomes ``cube``. If a ``Year``
        column is present it is cast to ``int``.
    """
    # get_feature_names was removed in scikit-learn 1.2; prefer the replacement
    # and fall back to the legacy method on older installations.
    name_getter = getattr(poly, "get_feature_names_out", None)
    if name_getter is None:
        name_getter = poly.get_feature_names
    raw_names = name_getter(features)

    safe_names = [
        str(name).replace(' ', '_').replace('^2', 'sq').replace('^3', 'cube')
        for name in raw_names
    ]

    frame = pd.DataFrame(polyX, columns=safe_names)
    if "Year" in frame.columns:
        frame["Year"] = frame["Year"].astype(int)
    return frame
    
    
def checkAlphas(myAlpha, allCols, trainData1, testData1):
    """Score each candidate Lasso alpha by its out-of-sample squared error.

    For every alpha, a Lasso is fitted on degree-3 polynomial features of the
    training rows. The terms whose coefficient magnitude is at least 1e-7 are
    then refitted with OLS (statsmodels formula API) on the training rows, and
    that OLS model predicts the test rows.

    Parameters
    ----------
    myAlpha : iterable of float
        Candidate regularisation strengths.
    allCols : list of str
        Columns to use. Must contain "Yield" (the target); the remaining
        columns are the inputs that get polynomially expanded.
    trainData1 : pandas.DataFrame
        Training rows; rows with a missing value in any of ``allCols`` are dropped.
    testData1 : pandas.DataFrame
        Test rows containing the ``allCols`` columns. Not NaN-filtered here.

    Returns
    -------
    list
        Sum of squared prediction errors on ``testData1``, one entry per alpha,
        in the order of ``myAlpha``.

    Notes
    -----
    Relies on ``changePolyNames`` (defined earlier in this notebook) to build
    formula-safe term names.
    NOTE(review): if an alpha removes every term, the formula becomes
    'Yield ~ ' with an empty right-hand side -- confirm how patsy handles that.
    """
    base_features = [col for col in allCols if col != 'Yield']

    # Everything below up to the loop is independent of alpha, so it is built once.
    train_clean = trainData1[allCols].dropna()
    X_train = train_clean[base_features].values
    y_train = train_clean["Yield"].values

    poly = PolynomialFeatures(3)
    polyX_train = poly.fit_transform(X_train)

    train_frame = changePolyNames(polyX_train, poly, base_features)
    # Column order matches the column order of polyX_train, hence clf.coef_.
    term_names = list(train_frame.columns)
    train_frame["Yield"] = y_train

    # Test rows reuse the expansion fitted on the training rows (transform only).
    polyX_test = poly.transform(testData1[base_features].values)
    test_frame = changePolyNames(polyX_test, poly, base_features)
    y_test = testData1["Yield"].values

    scores = []
    for alpha in myAlpha:
        # `normalize` is no longer passed: it was removed in scikit-learn 1.2 and
        # its old default already matched the False used here.
        # NOTE(review): tol=1e-100 looks intended to make the solver run for the
        # full max_iter rather than stop early -- confirm.
        clf = linear_model.Lasso(alpha=alpha, max_iter=100000, tol=1e-100).fit(polyX_train, y_train)

        # Terms that survive the L1 penalty.
        selected = [
            name for name, coef in zip(term_names, clf.coef_)
            if np.abs(coef) >= 1e-7
        ]

        # Unpenalised refit on the surviving terms.
        formula = 'Yield ~ ' + '+'.join(selected)
        ols_fit = smf.ols(formula, data=train_frame).fit()

        # Out-of-sample prediction; only the originally selected terms are supplied.
        predictions = ols_fit.predict(test_frame[selected])

        scores.append(np.sum(np.square(predictions - y_test)))

    return scores
        
       
######################################################        
######################################################

# Last year (inclusive) kept in trainData; rows with a later Year go to testData.
testYear = 2015

# Candidate column sets. Only the LAST assignment is in effect: each of the
# first three is overwritten by the line that follows it.
allCols = ["Yield", "Year", "Pmdi1_Jul", "Pmdi1_Aug", "Precip_Aug", "Maxtemp_Aug", "Maxtemp_Jun", "Precip_Jun", "Precip_Jul"]
allCols = ["Yield", "Year", "YearDummy2013", "Precip_Aug", "Maxtemp_Aug", "Maxtemp_Jun", "Precip_Jun", "Maxtemp_Jul", "Precip_Jul"]
allCols = ["Yield", "Year", "YearDummy2013", "Maxtemp_Jun", "Maxtemp_Jul", "Maxtemp_Aug", "Precip_Jun","Precip_Jul", "Precip_Aug"]  
allCols = ["Yield", "Year", "Precip_Aug"]

# Restrict the state-level table to the chosen columns.
allData2 = allData1[allCols]

# Split on Year: up to and including testYear vs. strictly after it.
trainData = allData2[allData2["Year"] <= testYear]
testData  = allData2[allData2["Year"] > testYear]

# Drop rows with a missing value in any selected column.
trainData = trainData.dropna()
testData  = testData.dropna()

###################
# Train model using transformed data
# This filter repeats the Year <= testYear condition already applied above.
trainData = trainData[trainData["Year"] <= testYear]
len1 = trainData.shape[0]
# Candidate Lasso alphas, passed to checkAlphas for every window.
myAlpha = [.0001, .001, .01, .1, .5, .75, 1, 20, 200, 1000, 2000, 3000, 5000, 10000]

# Offsets that move the end of the training window forward in steps of 5 rows.
yrs1 = [0,5,10,15,20]

# One DataFrame of (Alpha, Scores) per window.
ansAll = []
for q in yrs1:
    # Training window stops goBack rows before the end of trainData
    # (25, 20, 15, 10, 5 rows for the successive values of q).
    goBack = 25 - q
    # The next goForward rows after the training window are the validation rows.
    goForward = 5
    # Positional slicing: assumes rows are ordered by Year -- TODO confirm.
    trainData1 = trainData.iloc[0:(len1-goBack), :]
    #print("trainData1:", trainData1)
    testData1  = trainData.iloc[trainData1.shape[0]:trainData1.shape[0]+goForward, :]
    #print("testData1:", testData1)        
    ans2 = checkAlphas(myAlpha, allCols, trainData1, testData1)
    df1 = pd.DataFrame({"Alpha": myAlpha, "Scores": ans2})
    # Row with the lowest score for this window; not used again in this cell.
    df2 = df1.loc[df1['Scores'].idxmin()]
    ansAll.append(df1)

# Display the per-window score tables.
ansAll

[         Alpha       Scores
 0       0.0001    44.787247
 1       0.0010    39.130899
 2       0.0100    39.130899
 3       0.1000    39.207408
 4       0.5000    33.351268
 5       0.7500    33.351268
 6       1.0000    33.351268
 7      20.0000    29.994541
 8     200.0000   964.680106
 9    1000.0000  1073.369012
 10   2000.0000  1073.369012
 11   3000.0000  1073.369012
 12   5000.0000  1073.369012
 13  10000.0000  1073.369012,
          Alpha       Scores
 0       0.0001    43.533544
 1       0.0010    46.658465
 2       0.0100    43.143440
 3       0.1000    43.090986
 4       0.5000    42.639613
 5       0.7500    42.639613
 6       1.0000    42.639613
 7      20.0000    51.159053
 8     200.0000   908.696866
 9    1000.0000  1072.491754
 10   2000.0000  1072.491754
 11   3000.0000  1072.491754
 12   5000.0000  1072.491754
 13  10000.0000  1072.491754,
          Alpha      Scores
 0       0.0001  130.034282
 1       0.0010  115.699263
 2       0.0100  115.699263
 3       0.1000 

In [134]:
def myLasso(trainData, testData, allCols, myAlpha, myMinValue):
    """Select polynomial terms with Lasso, refit them with OLS, and predict the test rows.

    Parameters
    ----------
    trainData : pandas.DataFrame
        Training rows; rows with a missing value in any of ``allCols`` are dropped.
    testData : pandas.DataFrame
        Test rows; rows with a missing value in any of ``allCols`` are dropped.
    allCols : list of str
        Columns to use. Must contain "Yield" (the target); the remaining
        columns are expanded into degree-3 polynomial terms.
    myAlpha : float
        Lasso regularisation strength.
    myMinValue : float
        A term is kept when the absolute value of its Lasso coefficient is at
        least this threshold.

    Returns
    -------
    tuple
        ``(model, predictions)``: the fitted statsmodels OLS results for
        ``Yield ~ <selected terms>`` and its predictions for the test rows.

    Notes
    -----
    Relies on ``changePolyNames`` (defined earlier in this notebook) to build
    formula-safe term names, the same way ``checkAlphas`` does.
    """
    base_features = [col for col in allCols if col != 'Yield']

    # ---- training design matrix
    train_clean = trainData[allCols].dropna()
    X_train = train_clean[base_features].values
    y_train = train_clean["Yield"].values

    poly = PolynomialFeatures(3)
    polyX_train = poly.fit_transform(X_train)

    train_frame = changePolyNames(polyX_train, poly, base_features)
    # Column order matches the column order of polyX_train, hence clf.coef_.
    term_names = list(train_frame.columns)

    # `normalize` is no longer passed: it was removed in scikit-learn 1.2 and
    # its old default already matched the False used here.
    clf = linear_model.Lasso(alpha=myAlpha, max_iter=100000, tol=1e-100).fit(polyX_train, y_train)

    # Terms whose coefficient survives the threshold.
    selected = [
        name for name, coef in zip(term_names, clf.coef_)
        if np.abs(coef) >= myMinValue
    ]

    # ---- unpenalised refit on the selected terms
    train_frame["Yield"] = y_train
    formula = 'Yield ~ ' + '+'.join(selected)
    model = smf.ols(formula, data=train_frame).fit()

    # ---- test design matrix: reuse the expansion fitted on the training rows
    test_clean = testData[allCols].dropna()
    polyX_test = poly.transform(test_clean[base_features].values)
    test_frame = changePolyNames(polyX_test, poly, base_features)

    # Only the originally selected terms are supplied to the model.
    predictions = model.predict(test_frame[selected])

    return (model, predictions)

