# Developing Lasso and Ridge regressions

### We're using Lasso to "automatically" reduce the number of features.

In [12]:
import ipynb
from ipynb.fs.full.joinFeatures import mergedData
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd
from statsmodels.graphics.regressionplots import plot_leverage_resid2
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import os
#print(os.listdir("./"))
# NOTE(review): output_notebook is not imported by any visible cell -- presumably
# this is bokeh.io.output_notebook. On a fresh kernel this call would raise
# NameError unless the name is defined elsewhere. TODO confirm, then either add
# the import or drop the call.
output_notebook()

# Default figure size and font size applied to matplotlib figures from here on.
plt.rc("figure", figsize=(12,8))
plt.rc("font", size=10)


### Load data

1. Select a state
2. Create dummy variables


In [13]:
# Load the merged feature table and keep the rows for a single state.
# .copy() makes the filtered frame independent of the full table, so the column
# assignments below do not write into a slice (pandas SettingWithCopyWarning,
# which the notebook-wide warnings filter would otherwise hide).
allData1 = mergedData()
allData1 = allData1[allData1["State"] == "OHIO"].copy()

# Step dummies: 1 for every year strictly after the break year, 0 otherwise.
# The break years are listed in the order the columns were originally added.
for breakYear in (2005, 1980, 2013):
    allData1["YearDummy" + str(breakYear)] = (allData1["Year"] > breakYear).astype("int64")

model_allStates_maxTemp1.csv
model_allStates_minTemp1.csv
model_allStates_pdsi1.csv
model_allStates_pmdi1.csv
model_allStates_podcount.csv
model_allStates_precip.csv
model_allStates_quality.csv
model_allStates_yields.csv
model_allStates_zndx1.csv


### Scikit learn is a powerful library for creating and testing models.  

In [14]:
#Scikit learn
from sklearn.feature_selection import VarianceThreshold
#from sklearn.svm import LinearSV
from sklearn import linear_model 
from sklearn.feature_selection import SelectFromModel
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import PolynomialFeatures

### Grid search over alpha hyperparameter

Essentially, the idea is to find the best value of the alpha hyperparameter (alpha = 0 is ordinary least-squares regression; higher values shrink more coefficients to zero and so reduce the number of features).  

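1. Transform the data using polynomial features, e.g., for the inputs Year and Tmp: Year, Year², Tmp, Tmp², Year*Tmp, Year²*Tmp, Year*Tmp² (the code below uses degree 3, so cubic terms such as Year³ and Tmp³ are generated as well)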
2. Reduce the number of features by comparing training and test data.
3. Window, e.g., train on data 1925-1990, test on 1991-1995, train on data 1925-1995, test on 1996-2000 ....

In [21]:
def changePolyNames(polyX, poly, features):
    """Wrap a polynomial-expanded matrix in a DataFrame with formula-safe column names.

    Parameters
    ----------
    polyX : array-like
        Output of ``poly.fit_transform`` / ``poly.transform``.
    poly : fitted polynomial transformer
        Must provide ``get_feature_names_out`` (current scikit-learn) or the
        older ``get_feature_names``.
    features : list of str
        Names of the input columns that were expanded; must include "Year".

    Returns
    -------
    pandas.DataFrame
        One column per polynomial term. Spaces become ``_``, ``^2`` becomes
        ``sq`` and ``^3`` becomes ``cube``; the "Year" column is cast to int.
    """
    # get_feature_names was replaced by get_feature_names_out in newer
    # scikit-learn releases; use whichever the installed version provides.
    if hasattr(poly, "get_feature_names_out"):
        raw_names = poly.get_feature_names_out(features)
    else:
        raw_names = poly.get_feature_names(features)

    # Same substitutions, in the same order, applied to every term name.
    clean_names = [
        str(name).replace(' ', '_').replace('^2', 'sq').replace('^3', 'cube')
        for name in raw_names
    ]

    named = pd.DataFrame(polyX, columns=clean_names)
    named["Year"] = named["Year"].astype(int)
    return named
      
def checkAlphas(myAlpha, allCols, trainData1, testData1):
    """Score each Lasso penalty by its out-of-sample sum of squared errors.

    For every alpha, a Lasso is fitted on the degree-3 polynomial expansion of
    the training predictors. The terms whose coefficient is at least 1e-7 in
    absolute value are then refitted with OLS, and that OLS model is used to
    predict the test rows.

    Parameters
    ----------
    myAlpha : iterable of float
        Lasso penalties to evaluate.
    allCols : list of str
        Column names to use; must contain "Yield" (response) and "Year".
    trainData1 : pandas.DataFrame
        Training rows; rows with missing values in ``allCols`` are dropped.
    testData1 : pandas.DataFrame
        Test rows containing the columns in ``allCols``.

    Returns
    -------
    list
        One sum of squared prediction errors per entry of ``myAlpha``.
    """
    predictors = [col for col in allCols if col != 'Yield']

    # Nothing up to the Lasso fit depends on alpha, so prepare it once.
    train_clean = trainData1[allCols].dropna()
    y_train = train_clean["Yield"].values

    poly = PolynomialFeatures(3)
    train_poly = poly.fit_transform(train_clean[predictors].values)
    train_frame = changePolyNames(train_poly, poly, predictors)
    poly_names = list(train_frame.columns)  # captured before "Yield" is appended
    train_frame["Yield"] = y_train

    # Expand the test rows with the already-fitted transformer and the same
    # predictor order, so term names line up with the training terms.
    test_poly = poly.transform(testData1[predictors].values)
    test_frame = changePolyNames(test_poly, poly, predictors)
    y_test = testData1["Yield"].values

    scores = []
    for alpha in myAlpha:
        # The `normalize` argument is omitted: it was removed from newer
        # scikit-learn releases and False was its default behaviour.
        # NOTE(review): tol=1e-100 looks intended to force all max_iter
        # iterations -- confirm this is deliberate.
        clf = linear_model.Lasso(alpha=alpha, max_iter=100000, tol=1e-100).fit(train_poly, y_train)

        # Keep the terms Lasso did not shrink to (numerically) zero.
        selected = [name for name, coef in zip(poly_names, clf.coef_) if np.abs(coef) >= 1e-7]

        # Fall back to an intercept-only model so the formula always has a
        # right-hand side, even if every coefficient was zeroed.
        formula = 'Yield ~ ' + ('+'.join(selected) if selected else '1')
        print("Model:", formula)
        print('\n')
        refit = smf.ols(formula, data=train_frame).fit()

        # The formula only reads the selected columns, so the full test frame
        # can be passed as-is.
        predictions = refit.predict(test_frame)

        scores.append(np.sum(np.square(predictions - y_test)))

    return scores
        
       
######################################################        
######################################################

# Rows with Year <= testYear are used for the rolling windows below;
# rows after it go to testData.
testYear = 2015

# Response plus the predictors fed to the polynomial expansion.
# Earlier experiments also tried Pmdi1_* and Maxtemp_* columns for Jun-Aug.
allCols = ["Yield", "Year", "YearDummy2013", "Precip_Jun", "Precip_Jul", "Precip_Aug"]

allData2 = allData1[allCols]

# The windows below slice by row position, so put the rows in year order
# explicitly (stable sort: rows that share a year keep their relative order).
trainData = allData2[allData2["Year"] <= testYear].dropna().sort_values("Year", kind="mergesort")
testData = allData2[allData2["Year"] > testYear].dropna()

len1 = trainData.shape[0]

# Lasso penalties to compare in each window.
myAlpha = [.0001, .001, .01, .1, 1, 20, 100, 200, 500]

# Each step moves the training cut-off five rows later; the goForward rows
# right after the cut-off are the window's test set.
yrs1 = [0, 5, 10, 15, 20]
goForward = 5

ansAll = []
for q in yrs1:
    goBack = 25 - q
    trainData1 = trainData.iloc[0:(len1 - goBack), :]
    testData1 = trainData.iloc[trainData1.shape[0]:trainData1.shape[0] + goForward, :]
    ans2 = checkAlphas(myAlpha, allCols, trainData1, testData1)
    df1 = pd.DataFrame({"Alpha": myAlpha, "Scores": ans2})
    df2 = df1.loc[df1['Scores'].idxmin()]  # row with the lowest score in this window
    ansAll.append(df1)

ansAll

Model: Yield ~ Year+Precip_Jun+Precip_Jul+Precip_Aug+Yearsq+Year_Precip_Jun+Year_Precip_Jul+Year_Precip_Aug+Precip_Junsq+Precip_Jun_Precip_Jul+Precip_Jun_Precip_Aug+Precip_Julsq+Precip_Jul_Precip_Aug+Precip_Augsq+Yearcube+Yearsq_Precip_Jun+Yearsq_Precip_Jul+Yearsq_Precip_Aug+Year_Precip_Junsq+Year_Precip_Jun_Precip_Jul+Year_Precip_Jun_Precip_Aug+Year_Precip_Julsq+Year_Precip_Jul_Precip_Aug+Year_Precip_Augsq+Precip_Juncube+Precip_Junsq_Precip_Jul+Precip_Junsq_Precip_Aug+Precip_Jun_Precip_Julsq+Precip_Jun_Precip_Jul_Precip_Aug+Precip_Jun_Precip_Augsq+Precip_Julcube+Precip_Julsq_Precip_Aug+Precip_Jul_Precip_Augsq+Precip_Augcube


Model: Yield ~ Year+Yearsq+Year_Precip_Jun+Year_Precip_Jul+Year_Precip_Aug+Precip_Junsq+Precip_Jun_Precip_Jul+Precip_Julsq+Precip_Jul_Precip_Aug+Precip_Augsq+Yearcube+Yearsq_Precip_Jun+Yearsq_Precip_Jul+Year_Precip_Junsq+Year_Precip_Jun_Precip_Jul+Year_Precip_Jun_Precip_Aug+Year_Precip_Julsq+Year_Precip_Jul_Precip_Aug+Year_Precip_Augsq+Precip_Juncube+Precip_Junsq

[      Alpha      Scores
 0    0.0001  180.637411
 1    0.0010  449.748871
 2    0.0100  825.688086
 3    0.1000  129.362933
 4    1.0000   23.358454
 5   20.0000   60.613215
 6  100.0000   80.688709
 7  200.0000  694.954243
 8  500.0000  683.667751,
       Alpha       Scores
 0    0.0001  1766.838466
 1    0.0010   397.459559
 2    0.0100   328.418321
 3    0.1000   341.694860
 4    1.0000   185.445898
 5   20.0000   128.367480
 6  100.0000   128.367480
 7  200.0000   149.269016
 8  500.0000   912.043504,
       Alpha      Scores
 0    0.0001  155.134026
 1    0.0010  111.982634
 2    0.0100  129.994699
 3    0.1000  133.927829
 4    1.0000  171.876445
 5   20.0000  155.058506
 6  100.0000  155.058506
 7  200.0000  155.058506
 8  500.0000  496.498001,
       Alpha       Scores
 0    0.0001   221.209926
 1    0.0010   134.272717
 2    0.0100    99.586795
 3    0.1000    98.793463
 4    1.0000    87.862309
 5   20.0000    80.904954
 6  100.0000    98.647147
 7  200.0000   618.576474
 8 