# Lasso Regression 

## Implementation to get the top features that impact the label

### Import the necessary packages

In [1]:
import pyodbc 
import pandas as pd
import json
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import numpy as np


### Fetch the training data from SQL Server

In [2]:
# Open a trusted (integrated-security) ODBC connection to the RENTERS_STP
# database on the SQL Server instance named in the connection string.
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=LAPTOP-HVRMUNPF;'
                      'PORT=1433;'
                      'Database=RENTERS_STP;'
                      'Trusted_Connection=yes;'
                      )

# Select only rows whose ISO_CLAIMS_DIRECTOR_SCORE is present and is neither
# 0 nor 999 (NOTE(review): 0 and 999 look like sentinel scores -- confirm).
# Alternative filter, not used here: rows where
# DAYS_DIFF_CLAIM_LOSS_POLICY_EFFECTIVE is not null.
query = 'SELECT  * from [RENTERS_STP].[dbo].[DR_DetailedRequest_withNulls] where [ISO_CLAIMS_DIRECTOR_SCORE] is not null and [ISO_CLAIMS_DIRECTOR_SCORE] not in (0,999)'

try:
    # Load the full result set into a DataFrame.
    df = pd.read_sql(query, conn)
finally:
    # Release the ODBC handle even when the query fails; nothing below
    # needs the connection once the DataFrame is in memory.
    conn.close()

# Quick sanity check of what was loaded: column names and (rows, columns).
print(df.columns)
print(df.shape)


Index(['ANI_CALL_COUNT_FROM_PREV_CLM_CREATION',
       'ANI_CALL_COUNT_UNTIL_CLM_CREATION',
       'ANI_CLAIM_CREATION_DAY_CALL_COUNT', 'ANI_LAST_90_DAYS_CALL_COUNT',
       'ISO_CLAIM_MATCH_COUNT', 'ISO_PRIOR_SIU_INVOLVEMENT_REDFLAG',
       'ISO_RED_FLAGS_COUNT', 'ISO_OTHER_OPEN_CLAIM_IND',
       'ISO_NON_NAME_CLAIM_MATCH_COUNT', 'ISO_PHONE_MATCH_CLAIM_COUNT',
       'ISO_CLAIMS_DIRECTOR_SCORE', 'ISO_PROPERTY_ADD_NETMAP_ZIPMATCH_IND',
       'ISO_DAYS_DIFF_PREV_CURRENT_LOSS', 'POL_ADD_3YR_CLAIM_COUNT',
       'POLICY_ADD_RECORD_TYPE', 'ZIP4_GRADUATE_DEGREE_PERCENT',
       'ZIP4_WHITE_COLLAR_EMPLOYMENT', 'ZIP4_HOMEOWNERS_AND_RELATED_INSURANCE',
       'ZIP4_HOUSEHOLD_EARNING_OVER_100000',
       'ZIP4_SINGLE_PARENTS_WITH_CHILDREN_PERCENT', 'TLO_AGE',
       'TLO_CRIME_DETAILS_CRIME_TYPE',
       'TLO_DAYS_DIFF_BANKRUPTCY_FILED_CLAIM_CREATION',
       'TLO_PERSONAL_JUDGMENT_FILING_COUNT',
       'TLO_PERSONAL_JUDGMENT_FILING_TYPE', 'TLO_PRIOR_CRIME_DETAILS_IND',
       'TLO_RECENT_RI

### Get the features and the label from the dataset. Note: no train/test split is performed here; the model below is fitted on the full dataset


In [3]:
# Regression target: the PREDICTION_VALUE_Y column.
y = df['PREDICTION_VALUE_Y']

# Feature matrix: every column except the two PREDICTION_VALUE_* columns
# (presumably PREDICTION_VALUE_N is the complement of the target -- confirm),
# with non-numeric columns one-hot encoded and remaining missing values
# replaced by 0.
df_dummies = pd.get_dummies(
    df.drop(columns=['PREDICTION_VALUE_Y', 'PREDICTION_VALUE_N'])
).fillna(0)


### Perform lasso regression

In [None]:
# Fit an L1-regularised linear model on the encoded features and keep the
# fitted coefficient vector (one entry per column of df_dummies).
# NOTE(review): the features are not scaled before fitting, so coefficient
# magnitudes depend on each column's units -- confirm this is intended.
lasso = Lasso(alpha=0.4)
lasso.fit(df_dummies, y)
lasso_coeff = lasso.coef_

### Print the important features

In [None]:
# Pair every encoded feature name with its fitted Lasso coefficient.
coeffDict = dict(zip(df_dummies.columns, lasso_coeff))

# Keep a feature unless its coefficient is BOTH within [-0.50, 0.50] AND
# exactly equal to itself rounded to two decimals.
# NOTE(review): because of the second clause, a coefficient inside
# [-0.50, 0.50] is only discarded when it has no digits beyond two decimals
# (presumably mostly the exact zeros Lasso produces); confirm whether a
# plain magnitude threshold was intended instead.
importantFeatures = {}
for feature_name, coefficient in coeffDict.items():
    within_band = abs(coefficient) <= 0.50
    if within_band and round(coefficient, 2) == coefficient:
        print("{} too low".format(coefficient))
        continue
    importantFeatures[feature_name] = coefficient

print(importantFeatures)

# Parallel lists (same order) consumed by the plotting cell.
xvalues = list(importantFeatures)
yvalues = list(importantFeatures.values())

print("Labels are {} ".format(xvalues))
print("Values are {} ".format(yvalues))


### Display the important features

In [None]:
%matplotlib inline

# Line plot of the retained coefficients: one x position per feature,
# labelled with the feature name (rotated so long names stay readable).
tick_positions = np.arange(len(xvalues))
plt.plot(tick_positions, yvalues)
plt.xticks(tick_positions, xvalues, rotation=60, ha='right')
plt.margins(0.02)
plt.title('Lasso regression coef')
plt.show()