In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn import cross_validation
%matplotlib inline

# A function that splits a Pandas DataFrame into training and testing
def split_data(data, predict_cols, n_samples=False, random_state=None):
    """Split the labelled rows of ``data`` into training and testing arrays.

    Only rows whose ``hicov`` value is present are used, and any row with a
    missing value in ``predict_cols`` or ``hicov`` is dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``hicov`` column plus every column in ``predict_cols``.
    predict_cols : list of str
        Names of the feature columns.  The list is never modified, even if
        the call fails part-way (e.g. because a column is missing).
    n_samples : int or False, optional
        When truthy, that many labelled rows are drawn at random *before*
        rows with missing values are dropped, so the result may hold fewer
        than ``n_samples`` rows.  When falsy, all labelled rows are used.
    random_state : int or None, optional
        Seed forwarded to both the row sampling and the train/test split.
        The default ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature and target arrays, with 40% of the rows held out for testing.
    """
    # Work on a private copy so the caller's list is never touched.
    feature_cols = list(predict_cols)

    labelled = data[data.hicov.notnull()][feature_cols + ['hicov']]
    if n_samples:
        labelled = labelled.sample(n_samples, random_state=random_state)
    train_data = labelled.dropna()

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_data[feature_cols].values,
        train_data.hicov.values,
        test_size=0.4,
        random_state=random_state)

    return X_train, X_test, y_train, y_test

In [None]:
#%% Read in data

# Single-argument print(...) calls behave identically under Python 2 and
# Python 3; the bare `print` statement is a SyntaxError on Python 3.
print("Reading in the .txt file...")

# Tab-separated, gzip-compressed file; the first row is the header and the
# first column is used as the index.
data = pd.read_csv('coverage_data.txt.gz', header=0, sep="\t", index_col=0)

# The double space after the colon reproduces the output of the original
# two-argument print statement.
print("Size of data frame:  %s" % (data.shape,))
print("%.1f million rows" % (data.shape[0] / 1.0e6))


# Optional cleaning steps, currently disabled:
# Drop duplicates and NAs
#data.drop_duplicates(inplace=True)
#data.dropna(inplace=True)

#%%
#sns.pairplot(data, x_vars = ['agep', 'sex'], y_var = 'hcov', size=7, aspect_ratio=.7)

In [None]:
#%% Support vector classification

# Feature columns used for this model.
# Alternative: only the most complete columns
#predict_cols = ['agep', 'cit', 'dis', 'fs', 'mar', 'hincp', 'np', 'puma','rac1p', 'sex']
predict_cols = ['agep', 'bld', 'sch', 'cit', 'fs', 'hht', 'hincp', 'mar', 'mv', 'np', 'puma','rac1p', 'sex', 'st', 'type', 'veh']

# Fit on a random sample of 20000 labelled rows (rows with missing values
# are dropped after sampling, so slightly fewer rows may be used).
X_train, X_test, y_train, y_test = split_data(data, predict_cols, 20000)

clf = svm.SVC()
#clf = svm.LinearSVC()
clf.fit(X_train, y_train)

# Disabled: predict coverage for rows where hicov is missing.
#predict_data = data[data.hicov.isnull()][predict_cols].dropna()
#clf.predict(predict_data.head(999).values)

# print(...) with a single formatted argument works on Python 2 and 3; the
# double space matches the output of the original print statement.
print("SVC score:  %s" % clf.score(X_test, y_test))
#import pickle
#s = pickle.dumps(clf)

#%% Naive Bayes
from sklearn.naive_bayes import GaussianNB

# NOTE: this section does not build its own split.  It reuses the
# X_train / X_test / y_train / y_test arrays left in the namespace by the
# support vector section, so that section must be run first.
# A dedicated split was sketched here but is disabled:
#X_train, X_test, y_train, y_test = cross_validation.train_test_split( \
#    train_data[predict_cols].values,\
#    train_data.hicov.values, test_size=0.4, random_state=0)

gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Single-argument print(...) runs on both Python 2 and 3; the double space
# matches the output of the original print statement.
print("Gaussian score:  %s" % gnb.score(X_test, y_test))

#%% Linear Model
from sklearn import linear_model

#Most predictive features from random forest
predict_cols = ['agep', 'bld', 'cit', 'dis',  'fs', 'hht', 'hincp', 'mar',
                'noc', 'np', 'puma','rac1p', 'sch', 'sex', 'st']

# No n_samples argument: every labelled row without missing values is used.
X_train, X_test, y_train, y_test = split_data(data, predict_cols)

clf = linear_model.LogisticRegression(multi_class='multinomial', solver='lbfgs', verbose=1)
clf.fit(X_train, y_train)

# Single-argument print(...) runs on both Python 2 and 3; the double space
# matches the output of the original print statement.
print("Logistic Regression:  %s" % clf.score(X_test, y_test))

#%% Decision Tree
from sklearn import tree
#from sklearn.externals.six import StringIO  
#import pydot_ng as pydot

predict_cols = ['agep', 'bld', 'cit', 'dis',  'fs', 'hht', 'hincp', 'mar',
                'noc', 'np', 'puma','rac1p', 'sch', 'sex', 'st']

#predict_cols = ['agep', 'bld', 'cit', 'dis',  'fs', 'hht', 'hincp', 'mar',\
#     'mv', 'noc', 'np', 'puma','rac1p', 'sch', 'sex', 'st', 'type', 'veh']

X_train, X_test, y_train, y_test = split_data(data, predict_cols)

# Score one tree per maximum depth.  The depth must actually be passed to
# the classifier; otherwise every iteration fits the same unconstrained tree
# and the plot below does not show the effect of depth at all.
depths = list(range(1, 15))
scores = []
for depth in depths:
    clf = tree.DecisionTreeClassifier(max_depth=depth)
    clf.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))

# Test-set accuracy as a function of the depth limit.
plt.plot(depths, scores)
plt.xlabel('Tree Depth')
plt.ylabel('Score')

# Disabled: export the last fitted tree as a PDF via graphviz.
#dot_data = StringIO() 
#tree.export_graphviz(clf, out_file=dot_data) 
#graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
#graph.write_pdf("hcov-tree-6.pdf") 

#%% Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

predict_cols = ['agep', 'bld', 'cit', 'dis',  'fs', 'hht', 'hincp', 'mar',
                'mv', 'noc', 'np', 'puma','rac1p', 'sch', 'sex', 'st', 'type', 'veh']

X_train, X_test, y_train, y_test = split_data(data, predict_cols)

# 100 boosting stages of depth-3 trees; each split considers 11 of the
# 18 feature columns listed above.
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                 max_depth=3, max_features=11)

clf.fit(X_train, y_train)

# This is not the last expression of the cell, so a bare clf.score(...)
# would be evaluated and silently discarded; print it like the other models.
print("Gradient Boosting score:  %s" % clf.score(X_test, y_test))
#%% Print results from Linear Model
from sklearn import linear_model

predict_cols = ['agep', 'bld', 'cit', 'dis',  'fs', 'hht', 'hincp', 'mar',
                'mv', 'noc', 'np', 'puma','rac1p', 'sch', 'sex', 'st', 'type', 'veh']

# Training set: every row with a known hicov and no missing feature values.
# The whole labelled set is used here; nothing is held out for testing.
train_data = data[data.hicov.notnull()][predict_cols + ['hicov']].dropna()

X_train = train_data[predict_cols]
y_train = train_data.hicov

clf = linear_model.LogisticRegression()
clf.fit(X_train, y_train)

# Rows to score: hicov is missing but all feature columns are present.
predict_data = data[data.hicov.isnull()][predict_cols].dropna()

# Column 0 of predict_proba is the probability of the first class in
# clf.classes_.  NOTE(review): confirm which hicov value that corresponds to.
predicted_proba = clf.predict_proba(predict_data)
predict_data['probability_score'] = predicted_proba[:, 0]

# Write only the score column (plus the index) with a header row.
predict_data['probability_score'].to_csv(
    'roryhartongredden_datascience1_scores.csv', header=True)