In [93]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [100]:
def create_dataset(file_path):
    file = open(file_path) # opening the dataset
    # these store the different data items in each line
    y = []
    X = []
    while True:
        tweet = file.readline().rstrip() # removing excess whitespace, endline chars at the end of each line
        split = tweet.split(' ') # splitting into list of words at space
        y.append(split[-1]) # last work of each sentence is the target
        words = split[:-1] #
        if words:
            sentence = ' '.join(words[2:])
        X.append(sentence)
        
        if not tweet:
            break
    return X,y

In [101]:
X_list,y_list = create_dataset("waseemDataSet.txt")

In [102]:
def convert_to_dataframe(X):
    df = pd.DataFrame(X, columns=["tweet_text"])
    return df

In [103]:
X = convert_to_dataframe(X_list)

In [104]:
tweet_text = X["tweet_text"]

In [138]:
X_train, X_test, y_train, y_test = train_test_split(tweet_text, y_list, test_size=0.33, random_state=42)

In [139]:
vectorizer = CountVectorizer(ngram_range=(1, 3), token_pattern = r"(?u)\b\w+\b", analyzer='char')

In [140]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=False)

In [141]:
X_train_counts = vectorizer.fit_transform(X_train)

In [142]:
X_train_scaled = scaler.fit_transform(X_train_counts)

In [143]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_scaled, y_train)


In [144]:
from sklearn.ensemble import RandomForestClassifier
clf2 = RandomForestClassifier().fit(X_train_scaled, y_train)

In [145]:
X_test_counts = vectorizer.transform(X_test)
X_test_scaled = scaler.transform(X_test_counts)

In [186]:
y_pred = clf.predict(X_test_counts)

In [151]:
y_pred_2 = clf2.predict(X_test_scaled)

In [187]:
from sklearn.metrics import accuracy_score
print("Naive Bayes", accuracy_score(y_test, y_pred))
print("Random Forest", accuracy_score(y_test, y_pred_2))
print("Logistic Regression", accuracy_score(y_test, y_pred_3))

Naive Bayes 0.6860779020439646
Random Forest 0.8177786347859622
Logistic Regression 0.8069803316621674


In [169]:
from sklearn.linear_model import LogisticRegression
clf3 = LogisticRegression(solver='liblinear').fit(X_train_scaled, y_train)

In [184]:
y_pred_3 = clf3.predict(X_test_scaled)
print("Logistic Regression", accuracy_score(y_test, y_pred_3))

Logistic Regression 0.8069803316621674


#### Scaling doesn't matter, naives bayes is a bad idea
#### For logistic regression increasing iterations isn't really doing a lot

### Hyper param search below

In [171]:
# finding the hyper params
print(clf2.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [172]:
rf = RandomForestClassifier(criterion='entropy').fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)
print(accuracy_score(y_pred_rf, y_test))

0.8156575395295025


In [173]:
rf = RandomForestClassifier(n_jobs=3)

In [174]:
# ranges for hyperparams
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
min_samples_split = [2, 3, 4, 5]
criterion = ['gini', 'entropy', 'log_loss']
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'min_samples_split': min_samples_split,
               'bootstrap': bootstrap,
               'criterion': criterion}

In [175]:
from sklearn.model_selection import RandomizedSearchCV

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model - takes one hour
rf_random.fit(X_train_counts, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


90170.58s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
90170.61s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
90170.63s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
90170.63s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
90170.63s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
90170.66s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
90170.66s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
90170.67s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
90170.67s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


[CV] END bootstrap=True, criterion=log_loss, min_samples_split=3, n_estimators=800; total time=  51.4s
[CV] END bootstrap=True, criterion=log_loss, min_samples_split=3, n_estimators=800; total time=  51.6s
[CV] END bootstrap=True, criterion=gini, min_samples_split=4, n_estimators=1000; total time= 1.2min
[CV] END bootstrap=True, criterion=gini, min_samples_split=4, n_estimators=1000; total time= 1.2min
[CV] END bootstrap=True, criterion=gini, min_samples_split=4, n_estimators=1000; total time= 1.2min
[CV] END bootstrap=True, criterion=log_loss, min_samples_split=3, n_estimators=800; total time=  53.8s
[CV] END bootstrap=True, criterion=gini, min_samples_split=2, n_estimators=1400; total time= 2.0min
[CV] END bootstrap=True, criterion=gini, min_samples_split=2, n_estimators=1400; total time= 2.1min
[CV] END bootstrap=True, criterion=gini, min_samples_split=2, n_estimators=1400; total time= 2.1min
[CV] END bootstrap=True, criterion=log_loss, min_samples_split=4, n_estimators=1000; total 

90778.00s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


[CV] END bootstrap=True, criterion=gini, min_samples_split=3, n_estimators=1200; total time= 1.5min
[CV] END bootstrap=False, criterion=entropy, min_samples_split=4, n_estimators=1200; total time= 1.8min
[CV] END bootstrap=False, criterion=entropy, min_samples_split=2, n_estimators=1800; total time= 3.2min
[CV] END bootstrap=True, criterion=gini, min_samples_split=3, n_estimators=1200; total time= 1.4min
[CV] END bootstrap=False, criterion=entropy, min_samples_split=4, n_estimators=1200; total time= 1.8min
[CV] END bootstrap=False, criterion=entropy, min_samples_split=4, n_estimators=1200; total time= 1.7min


90794.56s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


[CV] END bootstrap=False, criterion=entropy, min_samples_split=2, n_estimators=1800; total time= 3.6min


90804.71s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


[CV] END bootstrap=False, criterion=gini, min_samples_split=5, n_estimators=600; total time=  58.5s
[CV] END bootstrap=False, criterion=gini, min_samples_split=5, n_estimators=600; total time=  58.3s
[CV] END bootstrap=False, criterion=gini, min_samples_split=5, n_estimators=600; total time=  56.8s
[CV] END bootstrap=True, criterion=gini, min_samples_split=3, n_estimators=1200; total time= 1.4min
[CV] END bootstrap=False, criterion=gini, min_samples_split=2, n_estimators=1200; total time= 2.4min
[CV] END bootstrap=False, criterion=gini, min_samples_split=2, n_estimators=1200; total time= 2.6min
[CV] END bootstrap=False, criterion=gini, min_samples_split=2, n_estimators=1200; total time= 2.6min
[CV] END bootstrap=True, criterion=gini, min_samples_split=3, n_estimators=2000; total time= 2.4min
[CV] END bootstrap=True, criterion=gini, min_samples_split=3, n_estimators=2000; total time= 2.4min
[CV] END bootstrap=False, criterion=gini, min_samples_split=2, n_estimators=1600; total time= 3.5

91373.29s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


[CV] END bootstrap=False, criterion=log_loss, min_samples_split=4, n_estimators=1800; total time= 2.5min
[CV] END bootstrap=False, criterion=log_loss, min_samples_split=4, n_estimators=1800; total time= 2.5min
[CV] END bootstrap=False, criterion=entropy, min_samples_split=5, n_estimators=600; total time=  50.3s
[CV] END bootstrap=False, criterion=entropy, min_samples_split=5, n_estimators=600; total time=  50.5s
[CV] END bootstrap=True, criterion=gini, min_samples_split=3, n_estimators=1800; total time= 2.1min
[CV] END bootstrap=True, criterion=gini, min_samples_split=3, n_estimators=1800; total time= 2.1min
[CV] END bootstrap=True, criterion=gini, min_samples_split=3, n_estimators=1800; total time= 2.1min
[CV] END bootstrap=True, criterion=entropy, min_samples_split=4, n_estimators=2000; total time= 1.9min
[CV] END bootstrap=True, criterion=entropy, min_samples_split=4, n_estimators=2000; total time= 1.9min
[CV] END bootstrap=False, criterion=entropy, min_samples_split=5, n_estimators

92197.00s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


[CV] END bootstrap=False, criterion=log_loss, min_samples_split=2, n_estimators=1600; total time= 3.0min
[CV] END bootstrap=False, criterion=log_loss, min_samples_split=2, n_estimators=1600; total time= 3.1min
[CV] END bootstrap=True, criterion=log_loss, min_samples_split=5, n_estimators=1200; total time= 1.2min
[CV] END bootstrap=True, criterion=log_loss, min_samples_split=5, n_estimators=1200; total time= 1.2min
[CV] END bootstrap=True, criterion=log_loss, min_samples_split=5, n_estimators=1200; total time= 1.2min
[CV] END bootstrap=True, criterion=entropy, min_samples_split=5, n_estimators=800; total time=  52.1s
[CV] END bootstrap=True, criterion=entropy, min_samples_split=5, n_estimators=800; total time=  52.2s
[CV] END bootstrap=False, criterion=log_loss, min_samples_split=2, n_estimators=2000; total time= 4.0min


92356.15s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


[CV] END bootstrap=True, criterion=entropy, min_samples_split=5, n_estimators=800; total time=  50.5s
[CV] END bootstrap=False, criterion=log_loss, min_samples_split=2, n_estimators=2000; total time= 4.2min


92383.59s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


[CV] END bootstrap=False, criterion=log_loss, min_samples_split=2, n_estimators=2000; total time= 3.7min


92391.65s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


[CV] END bootstrap=False, criterion=log_loss, min_samples_split=3, n_estimators=2000; total time= 3.3min
[CV] END bootstrap=False, criterion=log_loss, min_samples_split=3, n_estimators=2000; total time= 3.2min
[CV] END bootstrap=False, criterion=log_loss, min_samples_split=3, n_estimators=2000; total time= 3.2min
[CV] END bootstrap=True, criterion=log_loss, min_samples_split=3, n_estimators=1200; total time= 1.3min
[CV] END bootstrap=True, criterion=log_loss, min_samples_split=3, n_estimators=1200; total time= 1.3min
[CV] END bootstrap=True, criterion=log_loss, min_samples_split=3, n_estimators=1200; total time= 1.3min
[CV] END bootstrap=False, criterion=gini, min_samples_split=3, n_estimators=1400; total time= 2.6min
[CV] END bootstrap=False, criterion=gini, min_samples_split=2, n_estimators=200; total time=  28.2s
[CV] END bootstrap=False, criterion=gini, min_samples_split=2, n_estimators=200; total time=  29.1s
[CV] END bootstrap=False, criterion=gini, min_samples_split=2, n_estimat

In [176]:
rf_random.best_params_

{'n_estimators': 1600,
 'min_samples_split': 2,
 'criterion': 'gini',
 'bootstrap': False}

In [177]:
rf_best = RandomForestClassifier(n_estimators=1600, min_samples_split=2, criterion='gini', bootstrap=False).fit(X_train_scaled, y_train)

In [189]:
y_pred_rf = rf_best.predict(X_test_scaled)
print("Best hyperparameters accuracy: ",accuracy_score(y_pred_rf, y_test))

Best hyperparameters accuracy:  0.8297338989587351
