In [1]:
import pandas as pd
import numpy as np
import patsy
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

#ad_df.replace(' ?', 'Missing', inplace=True)
#ad_df.head()
#ad_df.info()
#replace ? with numpy nan to enable fillna and mean functions
#votes_df.replace(['y', 'n', '?'], [1, 0, np.nan], inplace=True)
#votes_df.fillna(votes_df.mean(), inplace=True)
#votes_df.head(5)

def getCleanData(csvFileName):
    '''Reads in csv in the local folder and returns clean data frame'''
    ad_df = pd.read_csv(csvFileName)

    #Convert int64 to object to use in dmatrix
    ad_df['ADTIND'] = ad_df['ADTIND'].apply(str)
    ad_df['ADTOCC'] = ad_df['ADTOCC'].apply(str)
    ad_df['SEOTR'] = ad_df['SEOTR'].apply(str)
    ad_df['VETYN'] = ad_df['VETYN'].apply(str)

    #Create dummy vars
    ad_df_patsy = patsy.dmatrix("ACLSWKR\
                                +ADTIND\
                                +ADTOCC\
                                +AHGA\
                                +AHSCOL\
                                +AMARITL\
                                +AMJIND\
                                +AMJOCC\
                                +ARACE\
                                +AREORGN\
                                +ASEX\
                                +AUNMEM\
                                +AUNTYPE\
                                +AWKSTAT\
                                +CAPGAIN\
                                +CAPLOSS\
                                +DIVVAL\
                                +FILESTAT\
                                +GRINREG\
                                +GRINST\
                                +HHDFMX\
                                +HHDREL\
                                +MIGMTR1\
                                +MIGMTR3\
                                +MIGMTR4\
                                +MIGSAME\
                                +MIGSUN\
                                +NOEMP\
                                +PARENT\
                                +PEFNTVTY\
                                +PEMNTVTY\
                                +PENATVTY\
                                +PRCITSHP\
                                +SEOTR\
                                +VETQVA\
                                +VETYN-1", data=ad_df, return_type='dataframe')
    
    #Get non categorical fields and replace target with 1 and 0
    ad_df_for_model= ad_df[['AAGE','WKSWORK','YEAR','AHRSPAY','DIVVAL','CAPGAIN','CAPLOSS','NOEMP','INCOME_IND']].replace(['-50000', ' 50000+.'], [0, 1])
    
    #Add dummy matrix
    ad_df_for_model = pd.merge(ad_df_patsy,ad_df_for_model, right_index=True, left_index=True)
    
    print ad_df_for_model.info()
    
    return ad_df_for_model

In [2]:
trainingData = getCleanData('census-income.csv')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199523 entries, 0 to 199522
Columns: 483 entries, ACLSWKR[ Federal government] to INCOME_IND
dtypes: float64(474), int64(9)
memory usage: 736.8 MB
None


In [3]:
#Have to drop one column since it's missing in test data
#The better way is to impute this column in test data
#For now just do MVP
trainingData = trainingData.drop("HHDFMX[T. Grandchild <18 ever marr not in subfamily]", 1)

In [121]:
#ad_df.isnull().sum(axis=0)

In [4]:
#Split for training 
X = trainingData[trainingData.columns[:-1]]
y = trainingData[trainingData.columns[-1]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)

In [None]:
#X_KBest = SelectKBest(chi2, k=2).fit_transform(X, y)

In [9]:
#LogisticRegression scores
def getModelPerformanceStat(y_test, y_pred, model_logReg, data_df):
    logisticReg_acu = accuracy_score(y_test, y_pred)
    precision_logReg, recall_logReg, fscore_logReg, support_logReg = score(y_test, y_pred)

    print '\nLogistic Regressioin Stats'
    print('precision: {}'.format(precision_logReg))
    print('recall: {}'.format(recall_logReg))
    print('fscore: {}'.format(fscore_logReg))
    print('support: {}'.format(support_logReg))
    print('Accuracy:{}'.format(logisticReg_acu))
    
    #Coeficients
    coef = {}
    for i in range(len(data_df.columns)):
        coef[data_df.columns.values[i]] = round(model_logReg.coef_[0][i],3)
        #print data_df.columns.values[i], ' ', round(model_logReg.coef_[0][i],3)
        
    importances = model_logReg.coef_
    indices = np.argsort(importances)[::-1]

    
    # Print the feature ranking
    print("Feature ranking:")

    for f in range(data_df.shape[1]):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(data_df.shape[1]), importances[indices],
           color="r", yerr=std[indices], align="center")
    plt.xticks(range(data_df.shape[1]), indices)
    plt.xlim([-1, data_df.shape[1]])
    plt.show()
    

In [22]:
y_train.value_counts()

0    149659
1      9959
Name: INCOME_IND, dtype: int64

In [18]:
x_merged=pd.merge(X_train,pd.DataFrame(y_train),right_index=True, left_index=True)

In [23]:
list_index=x_merged[x_merged['INCOME_IND']==0].index


30592

In [27]:
from random import randint
random_ind=[randint(0,90000) for i in range(0,90000)]
#random_ind

In [32]:
x_merged_new=x_merged[list_index[random_ind]]
x_merged_new.head()

IndexError: indices are out-of-bounds

In [10]:
#Build Logistic Regression Model function
logistic = LogisticRegression()
logistic.fit(X_train,y_train)
y_pred = logistic.predict(X_test)
getModelPerformanceStat(y_test, y_pred, logistic, X_train)


Logistic Regressioin Stats
precision: [ 0.96143186  0.71956857]
recall: [ 0.99028867  0.38547255]
fscore: [ 0.97564694  0.50201559]
support: [37482  2423]
Accuracy:0.953564716201
Feature ranking:


IndexError: index 198 is out of bounds for axis 0 with size 1

AttributeError: 'LogisticRegression' object has no attribute 'summary'

In [152]:
#Test the model on testing data
testData = getCleanData('census-income-test.csv')

#Have to add HHDFMX[T. Grandchild <18 ever marr not in subfamily] because it's missing in test data
X_test2 = testData[testData.columns[:-1]]
y_test2 = testData[testData.columns[-1]]
y_test2_pred = logistic.predict(X_test2)
#logisticReg_acu2 = accuracy_score(y_test2, y_test2_pred)
getModelPerformanceStat(y_test2, y_test2_pred)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99762 entries, 0 to 99761
Columns: 482 entries, ACLSWKR[ Federal government] to INCOME_IND
dtypes: float64(473), int64(9)
memory usage: 367.6 MB
None

Logistic Regressioin Stats
precision: [ 0.96114884  0.73530298]
recall: [ 0.99061725  0.3942774 ]
fscore: [ 0.97566058  0.51331159]
support: [93576  6186]
Accuracy:0.953639662397


In [190]:
#for col2 in trainingData.columns:
#    print col2