# Meta Methods with Text Dataset

In [6]:
# Import libraries 

import numpy as np    # Numeric and matrix computation
import pandas as pd   # Optional: good package for manipulating data 
import sklearn as sk  # Package with learning algorithms implemented

# Load the pre-processed tweet table. The three listed columns are read as
# str, and the file is decoded as Latin-1.
twigen = pd.read_csv(
    "dataWithTextWordsPrep.csv",
    encoding='latin1',
    dtype=dict.fromkeys(("gender_gold", "profile_yn_gold", "tweet_coord"), str),
)
twigen.head()

Unnamed: 0.1,Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,gender,gender:confidence,profile_yn,profile_yn:confidence,...,evidence,great.,toe,favor,soccer,did.,wheel,shoulder,rank,expected
0,0,815719226,False,finalized,3,10/26/15 23:24,male,1.0,yes,1.0,...,0,0,0,0,0,0,0,0,0,0
1,1,815719227,False,finalized,3,10/26/15 23:30,male,1.0,yes,1.0,...,0,0,0,0,0,0,0,0,0,0
2,2,815719228,False,finalized,3,10/26/15 23:33,male,0.6625,yes,1.0,...,0,0,0,0,0,0,0,0,0,0
3,3,815719229,False,finalized,3,10/26/15 23:10,male,1.0,yes,1.0,...,0,0,0,0,0,0,0,0,0,0
4,4,815719230,False,finalized,3,10/27/15 1:15,female,1.0,yes,1.0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Columns removed before modelling, leaving `gender` and the per-word columns.
unused_columns = [
    '_unit_id', '_golden', '_unit_state', '_trusted_judgments', '_last_judgment_at',
    'gender:confidence', 'profile_yn', 'profile_yn:confidence', 'created', 'description',
    'fav_number', 'gender_gold', 'link_color', 'name', 'profile_yn_gold', 'profileimage',
    'sidebar_color', 'text', 'tweet_count', 'tweet_created', 'tweet_id', 'tweet_location',
    'retweet_count', 'tweet_coord', 'user_timezone',
]
twigen = twigen.drop(unused_columns, axis=1)

# The first remaining column is selected by position and removed as well.
first_column = twigen.columns[0]
twigen = twigen.drop(first_column, axis=1)

# Keep only the rows that have a gender value.
has_gender = twigen.gender.notnull()
twigen = twigen[has_gender]
twigen.head()

Unnamed: 0,gender,you,weather,get,my,from,me,with,that,update,...,evidence,great.,toe,favor,soccer,did.,wheel,shoulder,rank,expected
0,male,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,male,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,male,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,male,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,female,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
def parse_label(gender):
    """Map a gender string to its integer class code.

    Parameters
    ----------
    gender : str
        One of 'female', 'male' or 'brand'.

    Returns
    -------
    int
        0 for 'female', 1 for 'male', 2 for 'brand'.

    Raises
    ------
    ValueError
        If `gender` is anything else. An explicit exception is used instead
        of `assert`, because assertions are skipped when Python runs with -O
        and an invalid value would then silently yield None.
    """
    codes = {'female': 0, 'male': 1, 'brand': 2}
    try:
        return codes[gender]
    except (KeyError, TypeError):
        # KeyError: unknown value; TypeError: unhashable value.
        raise ValueError("invalid gender value: %r" % (gender,))

# KNN needs every value to be numeric, the label included: the feature
# columns are kept apart from `gender`, which is encoded as an integer.
data = twigen.drop('gender', axis=1)
labels = twigen['gender'].apply(parse_label)

print(data.shape)
data.head()

(18836, 2000)


Unnamed: 0,you,weather,get,my,from,me,with,that,update,be,...,evidence,great.,toe,favor,soccer,did.,wheel,shoulder,rank,expected
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Voting scheme

In [10]:
from sklearn.model_selection import cross_val_score
#from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Number of cross-validation folds, shared by every evaluation below.
# It is pinned to 3 rather than left as None: None meant 3-fold in older
# scikit-learn but means 5-fold from release 0.22 on. With an integer cv and
# a classifier, the folds are stratified.
cv = 3

# Candidate KNN settings: odd neighbourhood sizes 1..29, both weighting schemes.
params = {'n_neighbors': list(range(1, 30 + 1, 2)), 'weights': ('distance', 'uniform')}
knc = KNeighborsClassifier()
knn_search = GridSearchCV(knc, param_grid=params, cv=cv, n_jobs=-1)
knn_search.fit(data, labels)
print("Best Params fo Knn=", knn_search.best_params_, "Accuracy=", knn_search.best_score_)

# Best hyper-parameters found, reused when the KNN classifier is built.
parval = knn_search.best_params_

Best Params fo Knn= {'n_neighbors': 29, 'weights': 'distance'} Accuracy= 0.488160968358


In [11]:
# Base learners, first scored individually with cross-validation.
clf1 = GaussianNB()

# KNN configured with the hyper-parameters stored in `parval`.
clf2 = KNeighborsClassifier(n_neighbors=parval['n_neighbors'], weights=parval['weights'])

# random_state fixes the tree's random choices so repeated runs give the same score.
# NOTE(review): min_impurity_split was deprecated in scikit-learn 0.19 and removed
# in 1.0; min_impurity_decrease is its successor but is not a drop-in equivalent,
# so the original setting is kept here. Revisit when upgrading scikit-learn.
clf3 = DecisionTreeClassifier(criterion='entropy', min_samples_split=101,
                              min_impurity_split=1.3, random_state=0)

for model, label in zip([clf1, clf2, clf3], ['Naive Bayes', 'Knn', 'Dec. Tree']):
    scores = cross_val_score(model, data, labels, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f [%s]" % (scores.mean(), label))

Accuracy: 0.379 [Naive Bayes]
Accuracy: 0.488 [Knn]
Accuracy: 0.477 [Dec. Tree]


In [12]:
# Hard ("majority") voting over the three base classifiers.
voters = [('nb', clf1), ('knn', clf2), ('dt', clf3)]
eclf = VotingClassifier(estimators=voters, voting='hard')

scores = cross_val_score(eclf, data, labels, scoring='accuracy', cv=cv)
mean_accuracy = scores.mean()
print("Accuracy: %0.3f [%s]" % (mean_accuracy, "Majority Voting"))
    

Accuracy: 0.484 [Majority Voting]


In [13]:
# Soft voting with per-classifier weights: Naive Bayes 2, KNN 1, decision tree 2.
eclf = VotingClassifier(
    estimators=[('nb', clf1), ('knn', clf2), ('dt', clf3)],
    weights=[2, 1, 2],
    voting='soft',
)

scores = cross_val_score(eclf, data, labels, scoring='accuracy', cv=cv)
mean_accuracy = scores.mean()
print("Accuracy: %0.3f [%s]" % (mean_accuracy, "Weighted Voting"))

Accuracy: 0.478 [Weighted Voting]


## Bagging

In [14]:
from sklearn.ensemble import BaggingClassifier

# Ensemble sizes to evaluate. 200 restores the largest size reported in the
# saved results for this cell.
ensemble_sizes = [1, 2, 5, 10, 20, 50, 100, 200]

# Two passes over the same sizes: first with the default feature setting,
# then with max_features=0.35.
for pass_index, extra_params in enumerate([{}, {'max_features': 0.35}]):
    if pass_index > 0:
        print()  # blank line between the two result tables
    for nest in ensemble_sizes:
        # The base tree is passed positionally because its keyword name differs
        # between scikit-learn releases (base_estimator, later estimator).
        # random_state makes the bootstrap sampling repeatable.
        bagger = BaggingClassifier(DecisionTreeClassifier(), n_estimators=nest,
                                   random_state=0, **extra_params)
        scores = cross_val_score(bagger, data, labels, cv=cv, scoring='accuracy')
        print("Accuracy: %0.3f [%s]" % (scores.mean(), nest))

Accuracy: 0.471 [1]
Accuracy: 0.485 [2]
Accuracy: 0.505 [5]
Accuracy: 0.521 [10]
Accuracy: 0.537 [20]
Accuracy: 0.548 [50]
Accuracy: 0.549 [100]
Accuracy: 0.551 [200]

Accuracy: 0.429 [1]
Accuracy: 0.446 [2]
Accuracy: 0.479 [5]
Accuracy: 0.480 [10]
Accuracy: 0.508 [20]
Accuracy: 0.521 [50]
Accuracy: 0.536 [100]
Accuracy: 0.542 [200]


## Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Cross-validated accuracy of a random forest as the number of trees grows.
# 200 restores the largest size reported in the saved results for this cell;
# random_state makes the forest's random choices repeatable between runs.
for nest in [1, 2, 5, 10, 20, 50, 100, 200]:
    forest = RandomForestClassifier(n_estimators=nest, random_state=0)
    scores = cross_val_score(forest, data, labels, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f [%s]" % (scores.mean(), nest))

Accuracy: 0.451 [1]
Accuracy: 0.470 [2]
Accuracy: 0.499 [5]
Accuracy: 0.517 [10]
Accuracy: 0.537 [20]
Accuracy: 0.545 [50]
Accuracy: 0.554 [100]
Accuracy: 0.555 [200]


In [16]:
from sklearn.ensemble import ExtraTreesClassifier

# Cross-validated accuracy of extremely randomized trees as the ensemble grows.
# 200 restores the largest size reported in the saved results for this cell;
# random_state makes the random split choices repeatable between runs.
for nest in [1, 2, 5, 10, 20, 50, 100, 200]:
    extra_trees = ExtraTreesClassifier(n_estimators=nest, random_state=0)
    scores = cross_val_score(extra_trees, data, labels, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f [%s]" % (scores.mean(), nest))

Accuracy: 0.453 [1]
Accuracy: 0.458 [2]
Accuracy: 0.479 [5]
Accuracy: 0.491 [10]
Accuracy: 0.497 [20]
Accuracy: 0.509 [50]
Accuracy: 0.509 [100]
Accuracy: 0.509 [200]


## Boosting

In [17]:
from sklearn.ensemble import AdaBoostClassifier

# Cross-validated accuracy of AdaBoost with its default base learner as the
# number of boosting rounds grows. 200 restores the largest size reported in
# the saved results for this cell; random_state makes runs repeatable.
for nest in [1, 2, 5, 10, 20, 50, 100, 200]:
    booster = AdaBoostClassifier(n_estimators=nest, random_state=0)
    scores = cross_val_score(booster, data, labels, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f [%s]" % (scores.mean(), nest))

Accuracy: 0.481 [1]
Accuracy: 0.504 [2]
Accuracy: 0.515 [5]
Accuracy: 0.521 [10]
Accuracy: 0.531 [20]
Accuracy: 0.540 [50]
Accuracy: 0.546 [100]
Accuracy: 0.547 [200]


In [18]:
from sklearn.ensemble import AdaBoostClassifier

# Cross-validated accuracy of AdaBoost over depth-5 decision trees as the
# number of boosting rounds grows. 200 restores the largest size reported in
# the saved results for this cell; random_state makes runs repeatable.
for nest in [1, 2, 5, 10, 20, 50, 100, 200]:
    booster = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5),
                                 n_estimators=nest, random_state=0)
    scores = cross_val_score(booster, data, labels, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f [%s]" % (scores.mean(), nest))

Accuracy: 0.519 [1]
Accuracy: 0.526 [2]
Accuracy: 0.532 [5]
Accuracy: 0.530 [10]
Accuracy: 0.522 [20]
Accuracy: 0.511 [50]
Accuracy: 0.493 [100]
Accuracy: 0.489 [200]
