In [1]:
import pandas as pd
import numpy as np
import matplotlib
import re
import statsmodels.api as sm
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn import metrics
from ggplot import *


In [2]:
input_data = '../pa2/cs-training.csv'

In [3]:
df = pd.read_csv(input_data, index_col=0)

In [4]:
df.columns.values

array(['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio',
       'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans',
       'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines',
       'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents'], dtype=object)

In [7]:
a = df.iloc(['age'])
a

<pandas.core.indexing._iLocIndexer at 0x10e671f98>

In [6]:
print(type(df.describe()))

<class 'pandas.core.frame.DataFrame'>


In [7]:
summary = df.describe()

In [8]:
print(df.mode())

   SeriousDlqin2yrs  RevolvingUtilizationOfUnsecuredLines  age  \
0                 0                                     0   49   

   NumberOfTime30-59DaysPastDueNotWorse  DebtRatio  MonthlyIncome  \
0                                     0          0           5000   

   NumberOfOpenCreditLinesAndLoans  NumberOfTimes90DaysLate  \
0                                6                        0   

   NumberRealEstateLoansOrLines  NumberOfTime60-89DaysPastDueNotWorse  \
0                             0                                     0   

   NumberOfDependents  
0                   0  


In [9]:
print(df.shape)

(150000, 11)


In [10]:
print(df.mode().shape)

(1, 11)


In [11]:
summary = df.describe().append(df.mode())

In [12]:
print(summary.shape)

(9, 11)


In [13]:
summary = summary.T

In [14]:
print(summary.shape)

(11, 9)


In [15]:
summary.rename(columns = {0:'mode'}, inplace=True)

In [16]:
summary

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,mode
SeriousDlqin2yrs,150000,0.06684,0.249746,0,0.0,0.0,0.0,1,0
RevolvingUtilizationOfUnsecuredLines,150000,6.048438,249.755371,0,0.029867,0.154181,0.559046,50708,0
age,150000,52.295207,14.771866,0,41.0,52.0,63.0,109,49
NumberOfTime30-59DaysPastDueNotWorse,150000,0.421033,4.192781,0,0.0,0.0,0.0,98,0
DebtRatio,150000,353.005076,2037.818523,0,0.175074,0.366508,0.868254,329664,0
MonthlyIncome,120269,6670.221237,14384.674215,0,3400.0,5400.0,8249.0,3008750,5000
NumberOfOpenCreditLinesAndLoans,150000,8.45276,5.145951,0,5.0,8.0,11.0,58,6
NumberOfTimes90DaysLate,150000,0.265973,4.169304,0,0.0,0.0,0.0,98,0
NumberRealEstateLoansOrLines,150000,1.01824,1.129771,0,0.0,1.0,2.0,54,0
NumberOfTime60-89DaysPastDueNotWorse,150000,0.240387,4.155179,0,0.0,0.0,0.0,98,0


In [17]:
print(summary.shape)

(11, 9)


In [18]:
count_nan = pd.Series(len(df) - df.count(), name = "missing_vals").T

In [19]:
print(count_nan)

SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
Name: missing_vals, dtype: int64


In [20]:
summary = summary.append(count_nan).T

In [21]:
summary

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,missing_vals
count,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0,
mean,0.06684,6.048438,52.295207,0.421033,353.005076,6670.221237,8.45276,0.265973,1.01824,0.240387,0.757222,
std,0.249746,249.755371,14.771866,4.192781,2037.818523,14384.674215,5.145951,4.169304,1.129771,4.155179,1.115086,
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
25%,0.0,0.029867,41.0,0.0,0.175074,3400.0,5.0,0.0,0.0,0.0,0.0,
50%,0.0,0.154181,52.0,0.0,0.366508,5400.0,8.0,0.0,1.0,0.0,0.0,
75%,0.0,0.559046,63.0,0.0,0.868254,8249.0,11.0,0.0,2.0,0.0,1.0,
max,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0,
mode,0.0,0.0,49.0,0.0,0.0,5000.0,6.0,0.0,0.0,0.0,0.0,
DebtRatio,,,,,,,,,,,,0.0


In [22]:
variables = list(df.columns.values)

In [23]:
for var in variables:
    plt = ggplot(aes(x=var),data=df[var])+geom_histogram(binwidth=50)
    break

In [24]:
fig = df.age.hist()

In [25]:
plt.draw()



ValueError: If using all scalar values, you must pass an index

In [None]:
np.log1p(df.MonthlyIncome).hist()

In [26]:
fig = df.hist()

In [27]:
plt.show()

AttributeError: 'ggplot' object has no attribute 'show'

In [None]:
df['age'].count()

In [None]:
df['MonthlyIncome'].count()

In [None]:
type(df['MonthlyIncome'].count())

In [28]:
a1 = df['MonthlyIncome'].count()

In [29]:
a1>0

True

In [30]:
a1==2

False

In [33]:
ggplot(aes(x='age'), data=df['age']) + \
    geom_histogram()



ValueError: If using all scalar values, you must pass an index

In [34]:
mean = round(df['age'].mean(),2)

In [36]:
mean= round(mean,2)

In [37]:
mean

52.299999999999997