In [1]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

import pickle
import warnings

from UnigramTfFeatureGeneration1 import create_feature_set_and_labels, create_test_feature_set_and_labels
from UnigramTfidfFeaturesetGeneration1 import get_features, get_test_features

In [2]:
def begin_test(train_x, test_x, train_y, test_y):
    """Cross-validate several classifiers, print their scores and pickle each fitted model.

    The incoming train and test partitions are merged with ``+`` and every
    classifier is scored with 10-fold cross-validation on the merged data, so
    the original split is not used as a hold-out set.

    Args:
        train_x: Training feature rows.
        test_x: Test feature rows; merged with ``train_x`` via ``+``.
            Assumes both are Python lists so that ``+`` concatenates --
            TODO confirm (NumPy arrays would be added element-wise instead).
        train_y: Training labels.
        test_y: Test labels; merged with ``train_y`` via ``+``.
            The ``'f1'`` scorer presumably requires binary labels -- verify
            against the feature generators.

    Side effects:
        Prints mean accuracy (with standard deviation) and mean F-measure for
        each classifier, and writes ``zeroth3_<label>.sav`` into the current
        working directory for each of them.
    """
    x = train_x + test_x
    y = train_y + test_y

    logistic = LogisticRegression()
    sgd = SGDClassifier()
    svm = SVC()
    neighbours = KNeighborsClassifier()
    neural_net = MLPClassifier()
    tree = DecisionTreeClassifier()
    naive_bayes = MultinomialNB()

    # MultinomialNB is evaluated on its own below but is not a voting member.
    ensemble = VotingClassifier(
        estimators=[('logr', logistic), ('sgd', sgd), ('svm', svm),
                    ('kn', neighbours), ('nn', neural_net), ('dt', tree)],
        voting='hard')

    # Keep each label next to its estimator so the two cannot drift apart.
    candidates = [
        ('LogisticRegressionClassifier', logistic),
        ('SGDClassifierClassifier', sgd),
        ('SVCClassifier', svm),
        ('NearestNeighbourClassifier', neighbours),
        ('NeuralNetworkClassifier', neural_net),
        ('DecisionTreeClassifier', tree),
        ('MultinomialNB', naive_bayes),
        ('EnsembleClassifier', ensemble),
    ]

    for label, clf in candidates:
        scores = cross_val_score(clf, x, y, cv=10, scoring='accuracy')
        f_measure = cross_val_score(clf, x, y, cv=10, scoring='f1')
        print(label, "Accuracy:  ", scores.mean(), "+/- ", scores.std())
        print(label, "F-measure:  ", f_measure.mean())

        # cross_val_score trains clones and leaves `clf` itself unfitted, so
        # fit it on the full data first; otherwise the saved file would hold
        # an untrained model.
        clf.fit(x, y)
        filename = 'zeroth3_' + label + '.sav'
        # The context manager guarantees the file is flushed and closed.
        with open(filename, 'wb') as model_file:
            pickle.dump(clf, model_file)
        

In [3]:
def test_by_tf():
    """Build features with create_feature_set_and_labels and evaluate them via begin_test.

    The six corpus file names are passed positionally in the order listed
    below; their exact roles are defined by UnigramTfFeatureGeneration1
    (presumably positive/negative Hindi, English and Hinglish samples --
    confirm against that module).
    """
    corpus_files = (
        'pos_hindi_final.txt',
        'neg_hindi_final.txt',
        'pos_eng_final.txt',
        'neg_eng_final.txt',
        'pos_hinglish.txt',
        'neg_hinglish.txt',
    )
    features = create_feature_set_and_labels(*corpus_files)
    # The generator returns (train_x, train_y, test_x, test_y); begin_test
    # expects the two feature sets first, then the two label sets.
    train_x, train_y, test_x, test_y = features
    begin_test(train_x, test_x, train_y, test_y)

In [4]:
def test_by_tfidf():
    """Fetch features from get_features() and evaluate them via begin_test."""
    features = get_features()
    # get_features() yields (train_x, train_y, test_x, test_y); begin_test
    # takes the feature sets first and the label sets second.
    train_x, train_y, test_x, test_y = features
    begin_test(train_x, test_x, train_y, test_y)

In [5]:
# Silence every warning so the score listing stays readable.
warnings.filterwarnings('ignore')
if __name__ == '__main__':
    separator = "=" * 10
    print(separator)
    # The Unigram+Tf run is disabled; only the Tfidf evaluation executes.
    #print("Unigram+Tf Accuracies")
    #test_by_tf()
    print(separator)
    print("Unigram+Tfidf Accuracies")
    test_by_tfidf()

Unigram+Tfidf Accuracies
LogisticRegressionClassifier Accuracy:   0.9373980857851825 +/-  0.0038284296348812586
LogisticRegressionClassifier F-measure:   0.967683615819209
SGDClassifierClassifier Accuracy:   0.9481507739572257 +/-  0.012629768802925207
SGDClassifierClassifier F-measure:   0.9731204566805287
SVCClassifier Accuracy:   0.9373980857851825 +/-  0.0038284296348812586
SVCClassifier F-measure:   0.967683615819209
NearestNeighbourClassifier Accuracy:   0.9352002835873803 +/-  0.0049470556609484415
NearestNeighbourClassifier F-measure:   0.9665084745762711
NeuralNetworkClassifier Accuracy:   0.9438733309701052 +/-  0.012428985115806044
NeuralNetworkClassifier F-measure:   0.9707598138376474
DecisionTreeClassifier Accuracy:   0.9319744771357674 +/-  0.020393170051927152
DecisionTreeClassifier F-measure:   0.9654895688629532
MultinomialNB Accuracy:   0.9373980857851825 +/-  0.0038284296348812586
MultinomialNB F-measure:   0.967683615819209
EnsembleClassifier Accuracy:   0.94817440

In [13]:
# Silence every warning so the score listing stays readable.
warnings.filterwarnings('ignore')
if __name__ == '__main__':
    separator = "=" * 10
    print(separator)
    # The Unigram+Tf run is disabled; only the Tfidf evaluation executes.
    #print("Unigram+Tf Accuracies")
    #test_by_tf()
    print(separator)
    print("Unigram+Tfidf Accuracies")
    test_by_tfidf()

Unigram+Tfidf Accuracies
LogisticRegressionClassifier Accuracy:   0.8474077407740774 +/-  0.0029703333967534425
LogisticRegressionClassifier F-measure:   0.9173991548396623
SGDClassifierClassifier Accuracy:   0.9177032703270328 +/-  0.015849290739379726
SGDClassifierClassifier F-measure:   0.9547427170626752
SVCClassifier Accuracy:   0.8474077407740774 +/-  0.0029703333967534425
SVCClassifier F-measure:   0.9173991548396623
NearestNeighbourClassifier Accuracy:   0.8725103510351035 +/-  0.032349455987476274
NearestNeighbourClassifier F-measure:   0.9256420351500383
NeuralNetworkClassifier Accuracy:   0.9144931493149315 +/-  0.026663212302323713
NeuralNetworkClassifier F-measure:   0.951523742708857
DecisionTreeClassifier Accuracy:   0.9226436643664366 +/-  0.0239125374189162
DecisionTreeClassifier F-measure:   0.9531524752737999
MultinomialNB Accuracy:   0.8474077407740774 +/-  0.0029703333967534425
MultinomialNB F-measure:   0.9173991548396623
EnsembleClassifier Accuracy:   0.918593359

In [10]:
# Silence every warning so the score listing stays readable.
warnings.filterwarnings('ignore')
if __name__ == '__main__':
    separator = "=" * 10
    print(separator)
    # The Unigram+Tf run is disabled; only the Tfidf evaluation executes.
    #print("Unigram+Tf Accuracies")
    #test_by_tf()
    print(separator)
    print("Unigram+Tfidf Accuracies")
    test_by_tfidf()

Unigram+Tfidf Accuracies
LogisticRegressionClassifier Accuracy:   0.9233383565595377 +/-  0.0028412112001465835
LogisticRegressionClassifier F-measure:   0.9601390861555619
SGDClassifierClassifier Accuracy:   0.9395263246894056 +/-  0.010961404331998836
SGDClassifierClassifier F-measure:   0.96995888558848
SVCClassifier Accuracy:   0.9233383565595377 +/-  0.0028412112001465835
SVCClassifier F-measure:   0.9601390861555619
NearestNeighbourClassifier Accuracy:   0.9157992897854434 +/-  0.015673692076338095
NearestNeighbourClassifier F-measure:   0.955878379696534
NeuralNetworkClassifier Accuracy:   0.9405784667721047 +/-  0.012174473654583417
NeuralNetworkClassifier F-measure:   0.9692895683693583
DecisionTreeClassifier Accuracy:   0.9286450717674792 +/-  0.025052071801220217
DecisionTreeClassifier F-measure:   0.9638965258934176
MultinomialNB Accuracy:   0.9233383565595377 +/-  0.0028412112001465835
MultinomialNB F-measure:   0.9601390861555619
EnsembleClassifier Accuracy:   0.941676862

In [5]:
# Twitter run -- presumably get_features() was pointed at the Twitter corpus
# for this execution; confirm before comparing with the other result cells.
# Silence every warning so the score listing stays readable.
warnings.filterwarnings('ignore')
if __name__ == '__main__':
    separator = "=" * 10
    # The Unigram+Tf run (and its leading separator) is disabled here.
    #print("="*10)
    #print("Unigram+Tf Accuracies")
    #test_by_tf()
    print(separator)
    print("Unigram+Tfidf Accuracies")
    test_by_tfidf()

Unigram+Tfidf Accuracies
LogisticRegressionClassifier Accuracy:   0.9276749431027131 +/-  0.004500473964707945
LogisticRegressionClassifier F-measure:   0.9624750343812236
SGDClassifierClassifier Accuracy:   0.9417477099805291 +/-  0.012724462899481773
SGDClassifierClassifier F-measure:   0.9660792621521976
SVCClassifier Accuracy:   0.9276749431027131 +/-  0.004500473964707945
SVCClassifier F-measure:   0.9624750343812236
NearestNeighbourClassifier Accuracy:   0.9277102630889447 +/-  0.012476966650351067
NearestNeighbourClassifier F-measure:   0.9623548394663446
NeuralNetworkClassifier Accuracy:   0.9363713658945075 +/-  0.013740376913347306
NeuralNetworkClassifier F-measure:   0.9660269990562252
DecisionTreeClassifier Accuracy:   0.9287857887787763 +/-  0.022043118804057668
DecisionTreeClassifier F-measure:   0.9606169284911233
MultinomialNB Accuracy:   0.9276749431027131 +/-  0.004500473964707945
MultinomialNB F-measure:   0.9624750343812236
EnsembleClassifier Accuracy:   0.940672441