In [None]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

# Column names applied to both files (passed to read_csv as `names=`).
features = [
    "Age", "Workclass", "fnlwgt", "Education", "Education-Num",
    "Martial Status", "Occupation", "Relationship", "Race", "Sex",
    "Capital Gain", "Capital Loss", "Hours per week", "Country", "Target",
]

# Change these to local file if available
train_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
test_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'

# Parsing options shared by both reads: fields are split on commas with any
# surrounding whitespace, and "?" entries are read as missing values.
read_options = dict(names=features, sep=r'\s*,\s*', engine='python',
                    na_values="?", skipinitialspace=True)

original_train = pd.read_csv(train_url, **read_options)
# This will download 1.9M
# The first line of the test file is skipped (skiprows=1).
original_test = pd.read_csv(test_url, skiprows=1, **read_options)

original_train

In [None]:
# Display the frame parsed from test_url.
original_test

In [None]:
# Number of missing values in each column of the training frame.
original_train.isnull().sum()

In [None]:
# Number of missing values in each column of the test frame.
original_test.isnull().sum()

In [None]:
%matplotlib inline

# Frequency of each 'Race' value in the training frame, most common first.
counts = original_train['Race'].value_counts()
labels = counts.index

# One wedge per category; names go in the legend rather than on the wedges.
plt.pie(x=counts, startangle=90)
plt.legend(labels, loc='upper left', fontsize=15)
plt.title("Race", fontsize=20)


In [None]:
%matplotlib inline

# Frequency of each 'Sex' value in the training frame, most common first.
counts = original_train['Sex'].value_counts()
labels = counts.index

# One wedge per category; names go in the legend rather than on the wedges.
plt.pie(x=counts, startangle=90)
plt.legend(labels, loc='upper left', fontsize=15)
plt.title("Gender", fontsize=20)


In [None]:
# Stack the train and test frames so the summary covers every record.
original = pd.concat([original_train, original_test])

# Recode the income strings as 0/1. The combined frame is recoded for both the
# plain spellings ('<=50K', '>50K') and the dotted ones ('<=50K.', '>50K.');
# the train frame only for the plain ones and the test frame only for the
# dotted ones.
labels = (
    original['Target']
    .replace({'<=50K': 0, '>50K': 1})
    .replace({'<=50K.': 0, '>50K.': 1})
)
labels_train = original_train['Target'].replace({'<=50K': 0, '>50K': 1})
labels_test = original_test['Target'].replace({'<=50K.': 0, '>50K.': 1})

# NOTE: this adds an 'Income' column to the source frames themselves, so any
# later cell that reuses original_train / original_test will see it.
original_train['Income'] = labels_train
original_test['Income'] = labels_test
original['Income'] = labels

# Row masks reused for every count below.
is_male = original['Sex'] == 'Male'
is_female = original['Sex'] == 'Female'
earns_high = original['Income'] == 1
earns_low = original['Income'] == 0

# Men: group size, counts per income class, and each class's share of the group.
data_size_men = int(is_male.sum())
men_with_high_income = int((is_male & earns_high).sum())
men_with_low_income = int((is_male & earns_low).sum())
print(men_with_high_income)
percentage_men = men_with_high_income / data_size_men
percentage_men_low = men_with_low_income / data_size_men

# Women: same quantities.
data_size_women = int(is_female.sum())
women_with_high_income = int((is_female & earns_high).sum())
women_with_low_income = int((is_female & earns_low).sum())
percentage_women = women_with_high_income / data_size_women
percentage_women_low = women_with_low_income / data_size_women

# Two-row table: high-income count and high-income fraction per gender.
arr_m = [men_with_high_income, percentage_men]
arr_w = [women_with_high_income, percentage_women]
arr = pd.Series(["Total >50K", "Probability/Ratio"])
results = pd.DataFrame({'Male': arr_m, 'Female': arr_w}).set_index(arr)
results


In [None]:
    # Per-gender summary built from the counts and fractions computed in the
    # previous cell. The "probability" entries are fractions in [0, 1], not
    # percentages. Key order determines the row order of the table.
    male_summary = {
        'Count': data_size_men,
        'Total =<50K': men_with_low_income,
        'Class probability of low income': percentage_men_low,
        'Total > 50K': men_with_high_income,
        'Class probability of high income': percentage_men,
    }
    female_summary = {
        'Count': data_size_women,
        'Total =<50K': women_with_low_income,
        'Class probability of low income': percentage_women_low,
        'Total > 50K': women_with_high_income,
        'Class probability of high income': percentage_women,
    }

    adult_details = {
        'Adult Gender Differences': {'Male': male_summary, 'Female': female_summary}
    }

    # One column per gender, one row per statistic.
    df = pd.DataFrame(adult_details['Adult Gender Differences'])
    df

In [None]:
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Shared estimator; trainAndTestLR reconfigures and refits it on every call.
lr = linear_model.LogisticRegression()


def trainAndTestLR(data, Y, c, iterator):
    """Fit the shared logistic-regression model and return hold-out accuracy.

    The samples are split 60/40 into train and test parts (shuffled with
    ``random_state=0``, so the split is the same on every call). The
    module-level ``lr`` estimator is reconfigured and refit on the train part.
    The test-set confusion matrix is drawn as an annotated heatmap on a fresh
    figure and written to ``confusionLR.png`` in the working directory; the
    file is overwritten on each call.

    Parameters
    ----------
    data : array-like
        Feature matrix, one row per sample.
    Y : array-like
        Labels aligned with the rows of ``data``.
    c : float
        Value for the estimator's ``C`` parameter.
    iterator : int
        Value for the estimator's ``max_iter`` parameter.

    Returns
    -------
    float
        Accuracy of the fitted model on the 40% test part.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, Y, test_size=0.4, random_state=0, shuffle=True)

    lr.set_params(C=c, max_iter=iterator)
    lr.fit(X_train, y_train)
    test_pred = lr.predict(X_test)

    # Rows are the actual classes, columns the predicted ones.
    cm = confusion_matrix(y_test, test_pred)

    # A new figure per call keeps successive heatmaps (and their colorbars)
    # from piling up on the same axes when this is called in a loop.
    fig, ax = plt.subplots()
    # fmt='d' prints the integer counts in full; the default format would
    # abbreviate large counts to scientific notation.
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    fig.savefig('confusionLR.png')

    return accuracy_score(y_test, test_pred)

In [None]:
# Recombine both frames; trainAndTestLR makes its own train/test split.
data = pd.concat([original_train, original_test], axis=0)

# One-hot encode the categorical columns (missing values get no indicator
# column, so they become all-zero rows within their group).
data = pd.get_dummies(
    data,
    columns=['Workclass', 'Education', 'Martial Status', 'Occupation',
             'Relationship', 'Race', 'Country'],
)

# Binary-encode gender and the income label. The label appears both with and
# without a trailing dot, so all four spellings are mapped. astype(int) makes
# the integer dtype explicit instead of relying on replace() to downcast an
# object column.
data['Sex'] = data['Sex'].replace({'Female': 0, 'Male': 1}).astype(int)
data['Target'] = data['Target'].replace(
    {'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1}).astype(int)

# Hyperparameter grid for the logistic regression.
C = [1]
numIter = [100]

Y = data['Target'].values

# Besides 'Target', drop 'Income': an earlier cell adds it to original_train /
# original_test as a 0/1 copy of the label, and leaving it among the features
# would leak the answer to the model. errors='ignore' keeps this cell working
# when that column is absent.
train = data.drop(columns=['Target', 'Income'], errors='ignore').values

for it in numIter:
    for c in C:
        tmpResLR = trainAndTestLR(train, Y, c, it)
        print(tmpResLR)
        