# Meta Methods

In [1]:
# Import libraries 

import numpy as np    # Numeric and matrix computation
import pandas as pd   # Optional: good package for manipulating data 
import sklearn as sk  # Package with learning algorithms implemented

twigen = pd.read_csv("datapreprocessed.csv", encoding='latin1')
twigen.head()

Unnamed: 0,gender,created,description_length,fav_number,link_color,name_length,retweet_count,sidebar_color,text_length,tweet_coord,tweet_count,tweet_created,tweet_location,user_timezone
0,male,early_morning,21,0,cyan,7,0,white,109,False,110964,midday,Unknown,Chennai
1,male,midday,62,68,gray,11,0,white,139,False,7471,midday,Unknown,Eastern Time (US & Canada)
2,male,morning,35,7696,gray,14,1,white,80,False,5617,midday,India,Belgrade
3,male,evening,146,202,gray,11,0,white,138,False,1693,midday,United States,Pacific Time (US & Canada)
4,female,midday,160,37318,lightblue,12,0,black,95,False,31462,midday,Unknown,Unknown


In [2]:
def expand_categoric_column_with_binary_values(column_name):
    global twigen
    print(column_name)
    print(twigen[column_name].unique())
    twigen = pd.concat([
            twigen.drop(column_name, axis=1),
            pd.get_dummies(twigen[column_name], prefix=column_name, sparse=True)
        ], axis=1)

expand_categoric_column_with_binary_values('created')
expand_categoric_column_with_binary_values('tweet_created')
expand_categoric_column_with_binary_values('link_color')
expand_categoric_column_with_binary_values('sidebar_color')
expand_categoric_column_with_binary_values('tweet_location')
expand_categoric_column_with_binary_values('user_timezone')

twigen.info()
print(twigen.columns)
twigen.head()

created
['early_morning' 'midday' 'morning' 'evening' 'afternoon']
tweet_created
['midday']
link_color
['cyan' 'gray' 'lightblue' 'lightred' 'blue' 'black' 'violet' 'red'
 'orange' 'brown' 'green' 'lightgreen' 'white' 'yellow']
sidebar_color
['white' 'black' 'lightblue' 'orange' 'lightgreen' 'gray' 'lightred' 'cyan'
 'red' 'brown' 'blue' 'yellow' 'violet' 'green']
tweet_location
['Unknown' 'India' 'United States' 'United Kingdom' 'Canada' 'Haiti'
 'Saudi Arabia' 'Venezuela' 'Italy' 'Malaysia' 'Ireland' 'Poland' 'Spain'
 'Belgium' 'Nigeria' 'Portugal' 'South Africa' 'New Zealand' 'France'
 'Lebanon' 'Dominican Republic' 'Germany' 'Australia' 'Czechia' 'Croatia'
 'Indonesia' 'Swaziland' 'Paraguay' 'Mexico' 'Ecuador' 'South Korea'
 'Turkey' 'Syria' 'Greenland' 'Botswana' 'Brazil' 'Tunisia' 'Colombia'
 'Greece' 'Trinidad and Tobago' 'Bangladesh' 'Somalia' 'Netherlands'
 'Sweden' 'Hong Kong' 'Argentina' 'Japan' 'Egypt' 'Israel' 'Bolivia'
 'Bosnia & Herzegovina' 'Sri Lanka' 'North Africa' 'U

Unnamed: 0,gender,description_length,fav_number,name_length,retweet_count,text_length,tweet_coord,tweet_count,created_afternoon,created_early_morning,...,user_timezone_Urumqi,user_timezone_Vienna,user_timezone_Vilnius,user_timezone_Volgograd,user_timezone_Warsaw,user_timezone_Wellington,user_timezone_West Central Africa,user_timezone_Yakutsk,user_timezone_Yerevan,user_timezone_Zagreb
0,male,21,0,7,0,109,False,110964,0,1,...,0,0,0,0,0,0,0,0,0,0
1,male,62,68,11,0,139,False,7471,0,0,...,0,0,0,0,0,0,0,0,0,0
2,male,35,7696,14,1,80,False,5617,0,0,...,0,0,0,0,0,0,0,0,0,0
3,male,146,202,11,0,138,False,1693,0,0,...,0,0,0,0,0,0,0,0,0,0
4,female,160,37318,12,0,95,False,31462,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
def boolean_to_binary(column_name):
    global twigen
    twigen[column_name] = twigen[column_name].apply(lambda boolean: int(boolean))

boolean_to_binary('tweet_coord')

twigen.tweet_coord.sum()

157

In [4]:
def parse_label(gender):
    if gender == 'female': return 0
    if gender == 'male': return 1
    if gender == 'brand': return 2
    assert False # ensure gender is valid

# All values in KNN must be numeric, label included
labels = twigen.gender.apply(parse_label)
data = twigen.drop('gender', axis=1)

print(data.shape)
data.head()

(18836, 331)


Unnamed: 0,description_length,fav_number,name_length,retweet_count,text_length,tweet_coord,tweet_count,created_afternoon,created_early_morning,created_evening,...,user_timezone_Urumqi,user_timezone_Vienna,user_timezone_Vilnius,user_timezone_Volgograd,user_timezone_Warsaw,user_timezone_Wellington,user_timezone_West Central Africa,user_timezone_Yakutsk,user_timezone_Yerevan,user_timezone_Zagreb
0,21,0,7,0,109,0,110964,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,62,68,11,0,139,0,7471,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,35,7696,14,1,80,0,5617,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,146,202,11,0,138,0,1693,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,160,37318,12,0,95,0,31462,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Voting scheme

In [5]:
from sklearn.model_selection import cross_val_score
#from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

cv=10 # cross-validation strategy splits

#params = {'n_neighbors':list(range(1, 100 + 1, 2)), 'weights':['distance','uniform'], 'p': [1, 2]}
#knc = KNeighborsClassifier()
#clf = GridSearchCV(knc, param_grid=params,cv=cv,n_jobs=-1)  # If cv is integer, by default is Stratifyed 
#clf.fit(data, labels)
#print("Best Params fo Knn=",clf.best_params_, "Accuracy=", clf.best_score_)
#parval=clf.best_params_
parval={'n_neighbors': 63, 'weights': 'distance', 'p': 1}

In [6]:
clf1 = GaussianNB()

clf2 = KNeighborsClassifier(n_neighbors=parval['n_neighbors'],weights=parval['weights'])

clf3 = DecisionTreeClassifier(criterion='entropy', min_samples_split=101, min_impurity_split=1.3)

#clf4 = SVC(C=180, gamma=0.1)

for clf, label in zip([clf1, clf2, clf3], ['Naive Bayes','Knn', 'Dec. Tree']):
    scores = cross_val_score(clf, data, labels, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f [%s]" % (scores.mean(), label))

Accuracy: 0.377 [Naive Bayes]
Accuracy: 0.507 [Knn]
Accuracy: 0.525 [Dec. Tree]


In [7]:
eclf = VotingClassifier(estimators=[('nb', clf1), ('knn', clf2), ('dt', clf3)], voting='hard')
scores = cross_val_score(eclf, data, labels, cv=cv, scoring='accuracy')
print("Accuracy: %0.3f [%s]" % (scores.mean() , "Majority Voting"))

Accuracy: 0.509 [Majority Voting]


In [8]:
eclf = VotingClassifier(estimators=[('nb', clf1), ('knn', clf2), ('dt', clf3)],voting='soft', weights=[1,2,2])
scores = cross_val_score(eclf, data, labels, cv=cv, scoring='accuracy')
print("Accuracy: %0.3f [%s]" % (scores.mean(), "Weighted Voting"))

Accuracy: 0.540 [Weighted Voting]


## Bagging

In [9]:
from sklearn.ensemble import BaggingClassifier

for nest in [1,2,5,10,20,50,100,200]:
    scores = cross_val_score(BaggingClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy', min_samples_split=101, min_impurity_split=1.3),n_estimators=nest), data, labels, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f [%s]" % (scores.mean(), nest))
    
print()
for nest in [1,2,5,10,20,50,100,200]:
    scores = cross_val_score(BaggingClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy', min_samples_split=101, min_impurity_split=1.3),n_estimators=nest,max_features=0.35), data, labels, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f [%s]" % (scores.mean(), nest))

Accuracy: 0.519 [1]
Accuracy: 0.531 [2]
Accuracy: 0.538 [5]
Accuracy: 0.543 [10]
Accuracy: 0.546 [20]
Accuracy: 0.547 [50]
Accuracy: 0.547 [100]
Accuracy: 0.548 [200]

Accuracy: 0.464 [1]
Accuracy: 0.501 [2]
Accuracy: 0.516 [5]
Accuracy: 0.527 [10]
Accuracy: 0.536 [20]
Accuracy: 0.551 [50]
Accuracy: 0.554 [100]
Accuracy: 0.555 [200]


## Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

for nest in [1,2,5,10,20,50,100,200]:
    scores = cross_val_score(RandomForestClassifier(n_estimators=nest), data, labels, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f [%s]" % (scores.mean(), nest))

Accuracy: 0.465 [1]
Accuracy: 0.477 [2]
Accuracy: 0.505 [5]
Accuracy: 0.531 [10]
Accuracy: 0.543 [20]
Accuracy: 0.553 [50]
Accuracy: 0.559 [100]
Accuracy: 0.559 [200]


In [11]:
from sklearn.ensemble import ExtraTreesClassifier

for nest in [1,2,5,10,20,50,100,200]:
    scores = cross_val_score(ExtraTreesClassifier(n_estimators=nest), data, labels, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f [%s]" % (scores.mean(), nest))

Accuracy: 0.461 [1]
Accuracy: 0.475 [2]
Accuracy: 0.494 [5]
Accuracy: 0.500 [10]
Accuracy: 0.513 [20]
Accuracy: 0.525 [50]
Accuracy: 0.523 [100]
Accuracy: 0.522 [200]


## Boosting

In [12]:
from sklearn.ensemble import AdaBoostClassifier

for nest in [1,2,5,10,20,50,100,200]:
    scores = cross_val_score(AdaBoostClassifier(n_estimators=nest), data, labels, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f [%s]" % (scores.mean(), nest))

Accuracy: 0.481 [1]
Accuracy: 0.503 [2]
Accuracy: 0.509 [5]
Accuracy: 0.520 [10]
Accuracy: 0.530 [20]
Accuracy: 0.542 [50]
Accuracy: 0.544 [100]
Accuracy: 0.542 [200]


In [13]:
for nest in [1,2,5,10,20,50,100,200]:
    scores = cross_val_score(AdaBoostClassifier(DecisionTreeClassifier(max_depth=5, criterion='entropy', min_samples_split=101, min_impurity_split=1.3),n_estimators=nest), data, labels, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f [%s]" % (scores.mean(), nest))

Accuracy: 0.523 [1]
Accuracy: 0.530 [2]
Accuracy: 0.539 [5]
Accuracy: 0.541 [10]
Accuracy: 0.540 [20]
Accuracy: 0.533 [50]
Accuracy: 0.524 [100]
Accuracy: 0.512 [200]
