In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas_profiling as pp

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show the entries of the input directory so the available files are visible.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

**Problem statement and Data Load:**
* We are going to use both classification and regression predictive models to predict the sale price of a house based on its features.
* We will be using pandas to help load the data that we will be building the model on.
* To ensure that the date and time related variables are not loaded in a numeric format, we explicitly specify the data type they should be loaded with.

In [None]:
# Year columns are read as strings so that they are not parsed as numbers.
year_columns = ['YearBuilt', 'YrSold', 'GarageYrBlt', 'YearRemodAdd']
data = pd.read_csv(
    "../input/train.csv",
    dtype={column: 'str' for column in year_columns},
)
data.shape

In [None]:
# Full per-column summary (verbose=True lists every column).
data.info(verbose=True)

**Understanding the data set:**
* In the process of model building, it is important for us to understand the business and the data set for which we are building a model.
* This will help us filter out unnecessary variables that would add noise rather than value to the prediction.
* We can take two approaches to understand the distribution of the variables under consideration: the `describe` command gives a quick summary, and Pandas profiling gives a more elaborate analysis.

In [None]:
# Summary statistics of the data set.
data.describe()

In [None]:
# Build the profiling report once: the same object is written to disk and
# rendered inline, instead of profiling the whole frame a second time.
profile = pp.ProfileReport(data)
profile.to_file("HousingSales.html")
profile

**Data Pre-Processing**
* We will be going through a series of data pre-processing techniques that help prepare the data that needs to go into the model.
* We will be doing some cleaning exercises and some variable transformation techniques too.

**Pre-processing 1: Missing Value Treatment**
* We must first understand the proportion of missing values in a variable. We get this information for each variable from the pandas profiling output.
* If the proportion of missing values is very high (More than 50%) then we should not consider the variable in the model building exercise as there will be no value add from this variable.
* We have to replace the missing values of a numerical variable from one of the following options:
a) Mean of the variable.
b) Median of the variable.
c) Zero (0)
* The recommended replacement is the median as it ensures that distribution of the variable is not disturbed.
* For the replacement of missing values in categorical variables, we could have one of two options:
a) Mode (The category with the highest frequency).
b) New category of “Not Available”.

In [None]:
# Names of all columns that contain at least one missing value.
has_missing = data.isna().any()
missing_list = data.columns[has_missing].tolist()
missing_list

* We combine the list of variables that we see to have missing values and the pandas profiling output to see which variables should be eliminated.
* We see that the variables 'Alley','MasVnrArea','PoolQC','Fence','MiscFeature' have far too many missing values to be considered in the process of model building and hence we will drop them from the data base in consideration and treat the other missing values. 

In [None]:
# Current dimensions of the data set.
data.shape

In [None]:
# Keep an independent snapshot of the frame as loaded.  A plain assignment
# would only bind a second name to the same object, so the in-place drop
# below would silently alter the "original" as well.
data_org = data.copy()

# Remove the columns judged to have too many missing values.
data.drop(['Alley','MasVnrArea','PoolQC','Fence','MiscFeature'], inplace=True, axis=1)
data.shape

In [None]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
categorical = ['object']

# Numeric columns: missing values are replaced by the column median.
numeric_columns = data.select_dtypes(include=numerics).columns.values
for column_name in list(numeric_columns):
    column_median = data[column_name].median()
    data[column_name] = data[column_name].replace(np.nan, column_median)

# Object columns: missing values become their own "Not_Available" category.
categorical_columns = data.select_dtypes(include=categorical).columns.values
for column_name in list(categorical_columns):
    data[column_name] = data[column_name].replace(np.nan, "Not_Available")

In [None]:
# Sanity check: list the columns that still contain missing values
# (expected to be empty after the imputation above).
data.columns[data.isna().any()].tolist()

** Preprocessing 2: Outlier Treatment**
* It is a pre-processing technique used to eliminate rare-case scenarios in numerical variables that can skew and distort the final result of the model.
* However we need to keep the business context in mind while treating variables for outliers.As outlier treatment alters the distribution of the variable and there may be variables whose distribution needs to be preserved.
* Box-plot and inter-quartile range is the most popular technique for outlier detection.

In [None]:
# Numeric dtypes that are considered for outlier treatment.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Numeric-only view of the data.
train_numeric = data.select_dtypes(include=numerics)
train_numeric.shape

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the numeric columns in groups of three, one figure per group.
# The step lives in range(): re-assigning the loop variable inside a `for`
# body has no effect, so the previous version plotted overlapping windows
# (each column up to three times) and relied on a hard-coded column count.
COLUMNS_PER_PLOT = 3
for start in range(0, train_numeric.shape[1], COLUMNS_PER_PLOT):
    plt.figure(figsize=(45, 10))
    b = sns.boxplot(data=train_numeric.iloc[:, start:start + COLUMNS_PER_PLOT])
    b.tick_params(labelsize=30)

In [None]:
# Cap every numeric column at Q1 - 1.5*IQR and Q3 + 1.5*IQR.
# NOTE(review): the list is built from all numeric columns, so any numeric
# identifier or target column is capped as well - confirm this is intended.
varstobetreated = list(train_numeric.columns)
for cols in varstobetreated:
    Q1, Q3 = (data[cols].quantile(q) for q in (0.25, 0.75))
    IQR = Q3 - Q1
    Lower_Limit = Q1 - 1.5*IQR
    Upper_Limit = Q3 + 1.5*IQR
    # Upper cap first, then lower cap, exactly in that order.
    capped_high = np.where(data[cols] > Upper_Limit, Upper_Limit, data[cols])
    data[cols] = np.where(capped_high < Lower_Limit, Lower_Limit, capped_high)

**Preprocessing 3: Filtering Variables**

**Categorical Variable Grouping**
* Python's modelling libraries only accept numerical variables while building a predictive model, so we have to convert independent categorical variables into dummy variables. Dummy variables are numeric binary (1/0) variables.
* If a categorical variable has k different values then we should create (k-1) dummy (1/0) variables for each (k-1) values of a variable. If all (k-1) dummy variable’s value is 0 then it indicates kth value of that variable.
* This indicates as the values of a categorical variable increases it will increase the number of independant variables which are getting added into the building of a model, this will make model more complex and may not have enough instances of every value in the categorical variable for the model to learn from.
* So we should ensure to restrict the number of categories of a categorical variable, such that the values in consideration should cover more than 95% of the instances in the considered dataset.
* From the Pandas profiling output we see that there are a few categorical variables with large number of distinct values and we will now attempt to group them.

**Identifying High Cardinality Variables** 
* The Year Variables viz. **YearRemodAdd, YearBuilt, GarageYrBlt** have a lot of distinct values.
* So in our initial iteration we will eliminate these variables (As they have over 50 distinct values), however based on the model's outcome we will consider grouping these variables.
* In the case of date and time variables we have to consider the aspect of order but shouldn't ignore the distribution either while grouping them.

**Identifying Low Variation Variables** 
* If a variable doesn't have enough variation it will not have any value in identifying the cases and values we want to identify with a predictive model.
* We use Pandas Profiling to identify such variables among both numerical and categorical variables. We remove these variables from our database, viz. **BsmtHalfBath, CentralAir, Condition2, Heating, RoofMatl, Street, Utilities** among the categorical variables and **Heating, PoolArea** among the numerical variables.

In [None]:
# Filtering out variables.  Each label appears exactly once ('Heating' was
# previously listed twice).
columns_to_drop = [
    # low-variation columns
    'BsmtHalfBath', 'CentralAir', 'Condition2', 'Heating', 'RoofMatl',
    'Street', 'Utilities', 'PoolArea',
    # high-cardinality year columns
    'YearRemodAdd', 'YearBuilt', 'GarageYrBlt',
]
data.drop(columns=columns_to_drop, inplace=True)
data.shape

In [None]:
# Columns carried into feature construction (identifier first, response last).
colstokeep = [
    'Id', 'MSSubClass', 'MSZoning', 'LotFrontage',
    'LotArea', 'LotShape', 'LandContour', 'LotConfig',
    'LandSlope', 'Neighborhood', 'Condition1', 'BldgType',
    'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
    'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
    'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'HeatingQC',
    'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
    'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
    'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
    'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
    'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'MiscVal',
    'MoSold', 'YrSold', 'SaleType', 'SaleCondition',
    'SalePrice',
]

In [None]:
def getFeatures(df , run_id, tuple_cols_toKeep, cat_ResponseVariable):
    """Build a dummy-encoded feature frame with the response column attached.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Source data.  When None, the data is fetched via get_data_from_SQL().
        NOTE(review): that helper is not defined in the visible part of this
        notebook - confirm it exists before relying on this fallback.
    run_id : str
        Not used by the function; kept so existing calls keep working.
    tuple_cols_toKeep : sequence of str or None
        Columns to keep.  None keeps every column of df.
    cat_ResponseVariable : str
        Name of the response column in df.

    Returns
    -------
    pandas.DataFrame
        The selected columns with object columns dummy-encoded
        (drop_first=True), without 'Id', and with the response column holding
        the original response values.
    """
    # Resolve the data source first: indexing df before this check made the
    # fallback unreachable (a None df failed on the lookup below instead).
    if df is None:
        df = get_data_from_SQL()
    response = df[cat_ResponseVariable]

    if tuple_cols_toKeep is None:
        features = df
    else:
        predictors = df.drop(columns=cat_ResponseVariable)
        features = pd.DataFrame(predictors, columns=tuple_cols_toKeep)
        # 'Id' is only an identifier; tolerate selections that omit it.
        features = features.drop(columns=['Id'], errors='ignore')

    features = pd.get_dummies(features, drop_first=True)
    features[cat_ResponseVariable] = response
    return features

In [None]:
# Dummy-encoded predictors plus the response, with the identifier re-attached.
features = getFeatures(data, '1020', colstokeep, 'SalePrice')
features_new = pd.DataFrame(data, columns=colstokeep)
features_final = pd.concat([features, features_new['Id']], axis=1)

# Keep a handle on the full frame and pull out key / response / column list.
features_copy = features_final
labelkey = features_final['Id']
labels = features_final['SalePrice']
feature_list = list(features_final.columns)

# Model matrix: everything except the response, the listed columns and the key.
excluded_columns = [
    'SalePrice', 'BsmtFinSF2', 'LowQualFinSF', 'EnclosedPorch',
    '3SsnPorch', 'ScreenPorch', 'MiscVal', 'Electrical_Mix',
]
features_final = features_final.drop(columns=excluded_columns)
features_final = features_final.drop(columns=['Id'])
features_final.shape

**Preprocessing 3: Filtering Variables**

**Multicollinearity**
* This is not a deal-breaker while dealing with machine learning techniques like random forest and xgboost. However is essential while doing linear and logistic regression. 
* Multicollinearity is a statistical phenomenon in which predictor variables in a logistic regression model are highly correlated.It is common when there are a large number of predictors in the model.
* Multicollinearity can cause unstable estimates and inaccurate variances which affects confidence intervals and hypothesis tests. The existence of collinearity inflates the variances of the parameter estimates, and consequently incorrect inferences about relationships between independant and response variables.
* Using the correlation matrix in pandas profiling output, we can first filter out  independent variables that are highly correlated to one another.
* VIF (Variance Inflation Factor) is one of the helpful metric to detect the multicollinearity. For moderate to large sample sizes, the approach to drop one of the correlated variables was established entirely satisfactory to reduce multicollinearity.
* The loop to identify variables that cause inflation eliminates those variables on account of a particular threshold. We can then compare them to the original dataset and put back variables if needed.


In [None]:
import numpy as np
import pandas as pd
import time
from statsmodels.stats.outliers_influence import variance_inflation_factor    
from joblib import Parallel, delayed

# Defining the function that you will run later
def calculate_vif_(X, thresh=5.0):
    """Iteratively drop the column with the largest variance inflation factor.

    On every pass the VIF of each remaining column is computed (in parallel
    via joblib).  If the largest VIF exceeds `thresh`, that column is removed
    and the pass is repeated; otherwise the loop ends.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors; every column must be convertible to float.
    thresh : float, default 5.0
        A column is dropped while its VIF is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        X restricted to the surviving columns, in their original order.
    """
    variables = list(X.columns)

    # A VIF needs at least one other regressor, so stop at a single column
    # (this also covers an empty frame, where max() used to raise).
    while len(variables) > 1:
        # Build the design matrix once per pass instead of once per task.
        design = X[variables].values.astype(float)
        vif = Parallel(n_jobs=-1, verbose=5)(
            delayed(variance_inflation_factor)(design, ix)
            for ix in range(len(variables))
        )
        vif = np.asarray(vif, dtype=float)

        # NaN VIFs are ignored consistently.  The built-in max() gave an
        # order-dependent answer here: a NaN in first position ended the
        # loop immediately, a NaN anywhere else was skipped.
        if np.isnan(vif).all():
            break
        maxloc = int(np.nanargmax(vif))
        if not vif[maxloc] > thresh:
            break
        variables.pop(maxloc)

    return X[variables]


In [None]:
# Keep only the predictors that survive the VIF filter.
X2 = calculate_vif_(features_final, 5)
features_final = X2

# Shape goes last: only the final expression of a cell is displayed.
features_final.shape

**Model Building**

Python offers two libraries that is useful for model building.

**Scikit-learn**
* The scikit-learn library in python helps build all 3 techniques of model building that we are considering today, which are Logistic Regression, Random Forest and XGB.
* Random Forest is available in scikit-learn as both a classifier and a regressor; XGB comes from the separate `xgboost` package, which offers a classifier and a regressor with the same scikit-learn style interface.
* So the syntax for all three stays the same.

**Stats Model**
* Logistic Regression can also be done through the statsmodels library instead of the scikit-learn library.
* The coefficients of the statsmodels and scikit-learn models vary because of the regularization applied in the scikit-learn package.


In [None]:
# Libraries that sklearn provides:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import statsmodels.formula.api as sm

**Database Split: Train and Test split**
* There are two types of validations that are most commonly used for both Classification and Regression predictive models.
    a) Out of Sample Validation
    b) Out of Time Validation.
* In the case of an Out of Time validation we have to consider a separate base from a separate time window that wasn't included in the base that was used to train.
* In the case of an out of sample validation we will have to split the base to seperate the base to train and the base to validate. We will explore the out of sample option for validation in our case.

In [None]:
# Out-of-sample split: 85% train / 15% test, with the Id key carried along.
(
    train_features, test_features,
    train_labels, test_labels,
    train_labelkey, test_labelkey,
) = train_test_split(features_final, labels, labelkey, test_size=0.15, random_state=42)

for caption, part in [
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Training Label Key Shape:', train_labelkey),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
    ('Testing Label Key Shape:', test_labelkey),
]:
    print(caption, part.shape)

**Model Build and Validation:**
* As our problem is originally a Regression problem, we will build a Random Forest Regressor. However, for Regression problems both XGBoost and Random Forest follow the same scikit-learn style syntax for build and predict.
* For Regression problems we have two potential options for a validation metric 
    a) MAE (Mean Absolute Error).
    b) MAPE (Mean Absolute Percentage Error)
* We will calculate the validation metric for both train and test base, to ensure that model hasn't been overfit to the training base so as to not be able to generalise and achieve the same level of accuracy for the test base.

In [None]:
# Shallow random forest regressor with a fixed seed.
rf = RandomForestRegressor(
    n_estimators=50,
    max_depth=5,
    min_samples_leaf=5,
    random_state=42,
)
rf.fit(train_features, train_labels)

**SKLEARN - XGBOOST**

* Sklearn also has a provision for XGBoost while dealing with regression predictive problems.
* Following on the similar syntax lines of Random Forest here is the code for XGBoost:

*from xgboost import XGBRegressor*

*xgb = XGBRegressor(n_estimators=50,random_state=421,max_depth=5,colsample_bytree=0.3)*

In [None]:
# Predictions of the fitted random forest on the training features.
predicted_vals_train = rf.predict(train_features)
predicted_vals_train

In [None]:
# Mean absolute percentage error on the training base.
labelkey_train = np.array(train_labelkey)
labels_train = np.array(train_labels)
predicted_vals_train = rf.predict(train_features)

data_train = pd.DataFrame(
    {'Id': labelkey_train, 'Actual': labels_train, 'Predicted': predicted_vals_train},
    columns=['Id', 'Actual', 'Predicted'],
)
data_train['pred_error'] = (data_train['Actual'] - data_train['Predicted']).abs()
data_train['pred_error_percent'] = data_train['pred_error'] / data_train['Actual']
print(data_train['pred_error_percent'].mean())

In [None]:
# Mean absolute percentage error on the held-out test base.
labelkey_test = np.array(test_labelkey)
labels_test = np.array(test_labels)
predicted_vals_test = rf.predict(test_features)

data_test = pd.DataFrame(
    {'Id': labelkey_test, 'Actual': labels_test, 'Predicted': predicted_vals_test},
    columns=['Id', 'Actual', 'Predicted'],
)
data_test['pred_error'] = (data_test['Actual'] - data_test['Predicted']).abs()
data_test['pred_error_percent'] = data_test['pred_error'] / data_test['Actual']
print(data_test['pred_error_percent'].mean())

**Understanding the contribution of variables**
* Sklearn provides Feature importance for its models that gives us an understanding of the contribution of variables to the prediction.
* This gives us a good sense of the health of the model.
* It also helps in understanding important factors that gives the business good insights in decision making
* If the contribution of any one variable, or of two variables as in the case below, is significantly higher than that of the others, it is a good idea to examine how the variable was created and how it correlates with the dependent variable.

In [None]:
# Bar chart of the 25 largest random-forest feature importances.
feat_importances = pd.Series(rf.feature_importances_, index=train_features.columns)
top_importances = feat_importances.nlargest(25)
top_importances.plot.bar()

In [None]:
# Rank all features of the random forest by importance, largest first.
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
feature_names = train_features.columns

print("Feature ranking:")
for position in indices:
    # Print the column name next to its positional index; the index alone
    # cannot be mapped back to a feature by the reader.
    print("feature  %d %s (%f)" % (position, feature_names[position], importances[position]))

**Classification Model Case Study :**
* We will now convert the existing Regression problem to a Classification problem and understand how to solve the same.
* Keeping the preprocessing the same, we will simply convert the dependent variable to binary and understand how the model build and evaluation differ from those of a regression problem.

In [None]:
# Binary target: 1 when the sale price is at or above the 80th percentile.
SalePrice_CutOff = np.percentile(features_copy['SalePrice'], 80)
is_top_band = features_copy['SalePrice'] >= SalePrice_CutOff
features_copy['SalePrice_Classification'] = np.where(is_top_band, 1, 0)

In [None]:
# Number of rows in each class of the binary target.
features_copy['SalePrice_Classification'].value_counts()

In [None]:
# Same predictor columns as the regression model, taken from the full frame.
final_colstokeep = list(features_final.columns)
labelkey_classification = features_copy['Id']
labels_classification = features_copy['SalePrice_Classification']
features_final_classification = features_copy.reindex(columns=final_colstokeep)
features_final_classification.shape

In [None]:
# Out-of-sample split for the classification problem (85% / 15%).
(
    train_features_class, test_features_class,
    train_labels_class, test_labels_class,
    train_labelkey_class, test_labelkey_class,
) = train_test_split(
    features_final_classification, labels_classification, labelkey_classification,
    test_size=0.15, random_state=42,
)

# Report the shapes of the classification split itself; the previous version
# printed the regression split's variables here by mistake.
print('Training Features Shape:', train_features_class.shape)
print('Training Labels Shape:', train_labels_class.shape)
print('Training Label Key Shape:', train_labelkey_class.shape)
print('Testing Features Shape:', test_features_class.shape)
print('Testing Labels Shape:', test_labels_class.shape)
print('Testing Label Key Shape:', test_labelkey_class.shape)

**Model Building and Evaluation**
* The sklearn package in python carries Random Forest, XGBoost and Logistic Regression for solving a Classification Predictive Model,
* We will look at one example with XGBoost Classifier, however the other techniques in the same package hold the same syntax
* Classification Models have a few common evaluation metrics that we will consider.
    a) AUC - Area Under the Curve.
    b) Confusion Matrix : Recall, Precision, F1-Score, Accuracy. 

In [None]:
# Gradient-boosted classifier with shallow trees and a fixed seed.
xgb_params = dict(
    gamma=0.01,
    learning_rate=0.1,
    max_depth=2,
    n_estimators=70,
    n_jobs=1,
    random_state=42,
)
xgb_class = XGBClassifier(**xgb_params)
xgb_class.fit(train_features_class, train_labels_class)

**SKLEARN - RF AND LOGISTIC REGRESSION**

* Sklearn also has a provision for Random Forest and Logistic Regression while dealing with classification predictive problems.
* Following on the similar syntax lines of XGBoost here is the code for Random Forest:

**Random Forest**

*from sklearn.ensemble import RandomForestClassifier*

*rf_class = RandomForestClassifier(n_estimators= 70, random_state = 42,max_depth=2)*

*rf_class.fit(train_features_class, train_labels_class)*

**Logistic Regression**

*from sklearn.linear_model import LogisticRegression*

*logreg = LogisticRegression()*

*logreg.fit(train_features_class, train_labels_class)*

In [None]:
# Score the training base and bucket the positive-class probability into deciles.
labelkey_class_train = np.array(train_labelkey_class)
labels_class_train = np.array(train_labels_class)
predicted_probs_train = xgb_class.predict_proba(train_features_class)

# Name the key and label columns explicitly; concatenating unnamed frames
# produced two columns that were both labelled 0.
data_train = pd.concat(
    [
        pd.DataFrame(
            {'Id': labelkey_class_train, 'TAG': labels_class_train},
            columns=['Id', 'TAG'],
        ),
        pd.DataFrame(predicted_probs_train, columns=['Col_0', 'Col_1']),
    ],
    axis=1,
)
data_train['prob_decile'] = pd.qcut(data_train['Col_1'], 10, labels=False)
data_train.head()

In [None]:
# Score the test base and bucket the positive-class probability into deciles.
labelkey_class_test = np.array(test_labelkey_class)
labels_class_test = np.array(test_labels_class)
predicted_probs_test = xgb_class.predict_proba(test_features_class)

# Name the key and label columns explicitly; concatenating unnamed frames
# produced two columns that were both labelled 0.
data_test = pd.concat(
    [
        pd.DataFrame(
            {'Id': labelkey_class_test, 'TAG': labels_class_test},
            columns=['Id', 'TAG'],
        ),
        pd.DataFrame(predicted_probs_test, columns=['Col_0', 'Col_1']),
    ],
    axis=1,
)
data_test['prob_decile'] = pd.qcut(data_test['Col_1'], 10, labels=False)
data_test.head()

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt

# ROC curve and AUC of the classifier on the training base.
positive_scores_train = predicted_probs_train[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(labels_class_train, positive_scores_train, pos_label=1)
auc_train = metrics.roc_auc_score(labels_class_train, positive_scores_train)
print(auc_train)
plt.plot(fpr, tpr, label="data 1, auc={}".format(str(auc_train)))
plt.legend(loc=4)

In [None]:
import matplotlib.pyplot as plt

# ROC curve and AUC of the classifier on the test base.
positive_scores_test = predicted_probs_test[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(labels_class_test, positive_scores_test, pos_label=1)
auc_test = metrics.roc_auc_score(labels_class_test, positive_scores_test)
print(auc_test)
plt.plot(fpr, tpr, label="data 1, auc={}".format(str(auc_test)))
plt.legend(loc=4)

In [None]:
# Peek at the first rows of the scored training frame.
data_train.head()

In [None]:
# Independent snapshot of the scored frame (a plain assignment would only
# alias the object that is modified below).
data_train_copy = data_train.copy()

# Label the five scoring columns by position.  Restricting to the first five
# columns keeps the cell re-runnable after 'prediction' has been appended.
columns_names = ['Id', 'TAG', 'Col_0', 'Col_1', 'prob_decile']
data_train = data_train.iloc[:, :len(columns_names)].copy()
data_train.columns = columns_names

# Decision threshold: lowest positive-class probability among the rows whose
# decile label is at least TOP_DECILE_CUTOFF.
TOP_DECILE_CUTOFF = 7
Base_Considered = data_train.loc[data_train['prob_decile'] >= TOP_DECILE_CUTOFF]
set_prob = Base_Considered['Col_1'].min()
print(set_prob)
data_train['prediction'] = np.where(data_train['Col_1'] >= set_prob, 1, 0)

# Rows per decile, split by actual class.
dt_pivot = pd.pivot_table(data_train, values='Id', index='prob_decile', columns='TAG',
                          aggfunc='count', margins=True)
dt_pivot.index.name = None
dt_pivot.columns.name = None
dt_pivot.columns = ['Base', 'Responders', 'Total']

# Gains table.  Rows are counted through 'Id': counting non-zero values of
# 'prob_decile' reported zero rows for the decile labelled 0.
table = pd.pivot_table(data_train, values=['Id', 'Col_1'], index=['prob_decile'],
                       aggfunc={'Id': 'count', 'Col_1': ['min', 'max']})
table['responders'] = dt_pivot['Responders']
Base_Considered = table.sort_index(ascending=False)
Base_Considered['cumulative_responders'] = Base_Considered['responders'].cumsum()
Base_Considered['responders_perc'] = (
    100 * Base_Considered['cumulative_responders'] / Base_Considered['responders'].sum()
)
# Printed explicitly: a bare expression in the middle of a cell is not shown.
print(Base_Considered)

# Confusion matrix (rows: actual, columns: predicted) with 'All' margins;
# empty combinations count as 0 instead of NaN.
confusion_matrix = pd.pivot_table(data_train, values='Id', index='TAG', columns='prediction',
                                  aggfunc='count', margins=True, fill_value=0)
confusion_matrix = confusion_matrix.rename(index={0: 'FALSE', 1: 'TRUE'},
                                           columns={0: 'FALSE', 1: 'TRUE'})
print(confusion_matrix)

# Look cells up by label rather than by position, so a missing class raises
# instead of silently reading the wrong cell.
true_positive = confusion_matrix.loc['TRUE', 'TRUE']
true_negative = confusion_matrix.loc['FALSE', 'FALSE']
predicted_positive = confusion_matrix.loc['All', 'TRUE']
actual_positive = confusion_matrix.loc['TRUE', 'All']
total = confusion_matrix.loc['All', 'All']

precision = true_positive / predicted_positive
recall = true_positive / actual_positive

print('Accuracy :')
print((true_negative + true_positive) / total)
print('precision:')
print(precision)
print('Recall:')
print(recall)
print('F1 Score:')
a = 2 * precision * recall
b = precision + recall
print(a/b)

In [None]:
# Bar chart of the 25 largest XGBoost feature importances.
feat_importances = pd.Series(xgb_class.feature_importances_, index=train_features_class.columns)
top_importances = feat_importances.nlargest(25)
top_importances.plot.bar()

In [None]:
# Rank all features of the XGBoost classifier by importance, largest first.
importances = xgb_class.feature_importances_
indices = np.argsort(importances)[::-1]
feature_names = train_features_class.columns

print("Feature ranking:")
for position in indices:
    # Print the column name next to its positional index; the index alone
    # cannot be mapped back to a feature by the reader.
    print("feature  %d %s (%f)" % (position, feature_names[position], importances[position]))

**Alternate Package for Logistic Regression**

* However in regards to logistic regression, Sklearn is not the only package under consideration.
* Stats Model is also a package that offers to build logistic regression in python.

In [None]:
import statsmodels.api as sm

# Logistic regression on the classification training split
# (response first, predictors second).
logit = sm.Logit(endog=train_labels_class, exog=train_features_class)

# Fit the model with the BFGS optimiser.
result = logit.fit(method='bfgs')

In [None]:
# Summary of the fitted logit model.
result.summary2()

In [None]:
def _scored_frame(keys, flags, probabilities):
    """Combine key, actual flag and predicted probability, then add deciles."""
    frame = pd.concat(
        [pd.DataFrame(keys), pd.DataFrame(flags), pd.DataFrame(probabilities)],
        axis=1,
    )
    frame.columns = ['Id', 'OrginalFlag', 'PredictedProbability']
    frame['prob_decile'] = pd.qcut(frame['PredictedProbability'], 10, labels=False)
    return frame


# Score both bases with the fitted logit model.
predicted_probs_train = result.predict(train_features_class)
data_train = _scored_frame(train_labelkey_class, train_labels_class, predicted_probs_train)

predicted_probs_test = result.predict(test_features_class)
data_test = _scored_frame(test_labelkey_class, test_labels_class, predicted_probs_test)
data_test.head()

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt

# ROC curve and AUC of the logit model on the training base.
fpr, tpr, thresholds = metrics.roc_curve(train_labels_class, predicted_probs_train, pos_label=1)
auc_train = metrics.roc_auc_score(train_labels_class, predicted_probs_train)
print(auc_train)
plt.plot(fpr, tpr, label="data 1, auc={}".format(str(auc_train)))
plt.legend(loc=4)

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt

# ROC curve and AUC of the logit model on the test base.
fpr, tpr, thresholds = metrics.roc_curve(test_labels_class, predicted_probs_test, pos_label=1)
auc_test = metrics.roc_auc_score(test_labels_class, predicted_probs_test)
print(auc_test)
plt.plot(fpr, tpr, label="data 1, auc={}".format(str(auc_test)))
plt.legend(loc=4)