In [148]:
import pandas as pd

In [149]:
#adjpg = pd.read_csv('adjpg.csv', lineterminator = '\n')

# Load the three input tables; paths are relative to the working directory.
user_features, post_features, snadjpg = map(
    pd.read_csv,
    ('user_features.csv', 'post_features.csv', 'snadjpg.csv'),
)


In [150]:
# Keep only the posts whose post_id also appears in user_features.
post_features = post_features.loc[
    lambda frame: frame['post_id'].isin(user_features['post_id'])
]

In [151]:
# One row per post_id: when a post_id repeats, the first occurrence wins.
post_features = post_features.drop_duplicates(
    subset=['post_id'],
    keep='first',
)

In [152]:
# Inner join of the user-level and post-level features on post_id.
merged = pd.merge(
    left=user_features,
    right=post_features,
    how='inner',
    on='post_id',
)

In [153]:
# posts.txt is tab-separated.
final = pd.read_csv('posts.txt', sep='\t')

In [154]:
# Restrict the posts to those that have a row in the merged feature table.
final = final.loc[lambda frame: frame['post_id'].isin(merged['post_id'])]

In [155]:
# Attach the feature columns to each post (inner join on post_id).
final_merge = pd.merge(
    left=final,
    right=merged,
    how='inner',
    on='post_id',
)

In [156]:
# Remove the identifier columns from the joined table.
final_merge = final_merge.drop(columns=['user_id', 'post_id', 'image_id(s)'])

In [157]:
# Discard every row that has a missing value in any column.
final_merge = final_merge.dropna(axis=0, how='any')

In [158]:
# Target vector: the 'label' column of the cleaned table.
y = final_merge.loc[:, 'label']

In [159]:
# The target and the timestamp/username columns are not used as features.
final_merge = final_merge.drop(columns=['label', 'timestamp', 'username'])

In [162]:
# Feature matrix: every remaining column except the raw post text.
X = final_merge.drop(columns=['post_text'])

In [163]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
xTrain, xTest, yTrain, yTest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [164]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

In [99]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(final_merge['post_text'])
xtrain_tfidf =  tfidf_vect.transform(xTrain['post_text'])
xvalid_tfidf =  tfidf_vect.transform(xTest['post_text'])

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(final_merge['post_text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(xTrain['post_text'])
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(xTest['post_text'])

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(final_merge['post_text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(xTrain['post_text']) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(xTest['post_text']) 

In [102]:
# Surface statistics of the raw post text, one new column per statistic.
final_merge['char_count'] = final_merge['post_text'].apply(lambda text: len(text))
final_merge['word_count'] = final_merge['post_text'].apply(lambda text: len(text.split()))
# Number of characters that are listed in string.punctuation.
final_merge['punctuation_count'] = final_merge['post_text'].apply(
    lambda text: sum(1 for ch in text if ch in string.punctuation)
)
# Whitespace-separated words for which str.istitle() is true.
final_merge['title_word_count'] = final_merge['post_text'].apply(
    lambda text: sum(1 for word in text.split() if word.istitle())
)
# Whitespace-separated words for which str.isupper() is true.
final_merge['upper_case_word_count'] = final_merge['post_text'].apply(
    lambda text: sum(1 for word in text.split() if word.isupper())
)

In [103]:
# Part-of-speech tag codes grouped by coarse family; the keys are the `flag`
# values accepted by check_pos_tag.
pos_family = dict(
    noun=['NN', 'NNS', 'NNP', 'NNPS'],
    pron=['PRP', 'PRP$', 'WP', 'WP$'],
    verb=['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
    adj=['JJ', 'JJR', 'JJS'],
    adv=['RB', 'RBR', 'RBS', 'WRB'],
)

# Count the words in a given sentence whose part-of-speech tag belongs to a given family.
def check_pos_tag(x, flag):
    """Count the words of ``x`` whose part-of-speech tag is in family ``flag``.

    The previous version called ``transblob.transBlob``, a name that does not
    exist, and hid the resulting NameError behind a bare ``except``, so every
    count came back as 0. The tagger is now reached through the ``textblob``
    module that the notebook imports, and errors are no longer swallowed.

    Parameters
    ----------
    x : str
        Text to tag. A value that is not a string (e.g. ``None`` or NaN)
        yields 0 instead of raising.
    flag : str
        Key of ``pos_family`` selecting the tag family to count.

    Returns
    -------
    int
        Number of (word, tag) pairs whose tag is listed in ``pos_family[flag]``.

    Raises
    ------
    KeyError
        If ``flag`` is not a key of ``pos_family``.
    """
    if not isinstance(x, str):
        return 0

    wanted_tags = pos_family[flag]
    # TextBlob(...).tags yields (word, tag) pairs. Tagger failures propagate on
    # purpose: silently returning 0 is what masked the original bug.
    return sum(1 for _, tag in textblob.TextBlob(x).tags if tag in wanted_tags)

# One '<family>_count' column per part-of-speech family, created in this order.
for family in ('noun', 'verb', 'adj', 'adv', 'pron'):
    final_merge[family + '_count'] = final_merge['post_text'].apply(
        check_pos_tag, args=(family,)
    )

In [106]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, label_valid=None):
    """Fit ``classifier`` on the training data and return its validation accuracy.

    Parameters
    ----------
    classifier
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    feature_vector_train
        Training features passed to ``classifier.fit``.
    label
        Training labels passed to ``classifier.fit``.
    feature_vector_valid
        Validation features passed to ``classifier.predict``.
    is_neural_net : bool, default False
        When true, the prediction array is reduced with ``argmax(axis=-1)``
        before scoring.
    label_valid : optional
        True labels for ``feature_vector_valid``. When omitted, the
        notebook-level ``yTest`` is used, which is what this function always
        scored against before the parameter existed.

    Returns
    -------
    The value returned by ``metrics.accuracy_score``.
    """
    if label_valid is None:
        # Backward-compatible default: the global hold-out labels.
        label_valid = yTest

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    # accuracy_score takes (y_true, y_pred), in that order.
    return metrics.accuracy_score(label_valid, predictions)

In [115]:
# Support Vector Machine on each TF-IDF representation. Every run used to be
# printed as "N-Gram Vectors"; each label now names the features actually used.

# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, yTrain, xvalid_tfidf_ngram)
print ("SVM, N-Gram Vectors: ", accuracy)

# SVM on Word Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf, yTrain, xvalid_tfidf)
print ("SVM, WordLevel TF-IDF: ", accuracy)

# SVM on Character Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram_chars, yTrain, xvalid_tfidf_ngram_chars)
print ("SVM, CharLevel Vectors: ", accuracy)



SVM, N-Gram Vectors:  0.5834575260804769




SVM, N-Gram Vectors:  0.5834575260804769




SVM, N-Gram Vectors:  0.5834575260804769


In [110]:
# Multinomial Naive Bayes, trained and scored once per TF-IDF representation
# (word level, word n-gram level, character level), in that order.
for caption, train_vectors, valid_vectors in (
    ("NB, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf),
    ("NB, N-Gram Vectors: ", xtrain_tfidf_ngram, xvalid_tfidf_ngram),
    ("NB, CharLevel Vectors: ", xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars),
):
    accuracy = train_model(naive_bayes.MultinomialNB(), train_vectors, yTrain, valid_vectors)
    print (caption, accuracy)

NB, WordLevel TF-IDF:  0.9135618479880775
NB, N-Gram Vectors:  0.8688524590163934
NB, CharLevel Vectors:  0.8602831594634873


In [112]:
# Extreme Gradient Boosting on the word-level and character-level TF-IDF
# vectors; each sparse matrix is converted with .tocsc() before being handed over.
for caption, train_vectors, valid_vectors in (
    ("Xgb, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf),
    ("Xgb, CharLevel Vectors: ", xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars),
):
    accuracy = train_model(
        xgboost.XGBClassifier(),
        train_vectors.tocsc(),
        yTrain,
        valid_vectors.tocsc(),
    )
    print (caption, accuracy)

Xgb, WordLevel TF-IDF:  0.8252608047690015
Xgb, CharLevel Vectors:  0.8692250372578242


In [114]:
# The classifier in this cell is LogisticRegression; the comments and printed
# labels previously said "Naive Bayes" / "NB", which made these scores
# indistinguishable from the MultinomialNB results.

# Logistic Regression on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, yTrain, xvalid_tfidf)
print ("LR, WordLevel TF-IDF: ", accuracy)

# Logistic Regression on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, yTrain, xvalid_tfidf_ngram)
print ("LR, N-Gram Vectors: ", accuracy)

# Logistic Regression on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, yTrain, xvalid_tfidf_ngram_chars)
print ("LR, CharLevel Vectors: ", accuracy)



NB, WordLevel TF-IDF:  0.9131892697466468
NB, N-Gram Vectors:  0.8766766020864382
NB, CharLevel Vectors:  0.907973174366617


In [118]:
from sklearn_pandas import DataFrameMapper, cross_val_score
from sklearn.model_selection import train_test_split

In [146]:
# Column -> transformer mapping that combines the TF-IDF text features with the
# numeric columns (a None transformer passes the column through unchanged).
#
# Changes from the previous definition:
#   * ' contains_exclammark', ' num_questmark', ' num_posts' and ' num_words'
#     were each listed twice, which would emit those features twice; every
#     column now appears once, in its original first-seen position.
#   * the text column gets its own unfitted TfidfVectorizer. The mapper fits its
#     transformers itself, so calling tfidf_vect.fit(...) here only re-fitted
#     (and thereby mutated) the vectorizer shared with the TF-IDF cell.
# The leading spaces in the column names are part of the keys and must stay.
# NOTE(review): 'pron_count' is computed alongside the other POS counts but is
# not listed here - confirm whether that omission is intentional.
mapper = DataFrameMapper([
    ('post_text', TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)),
    ('noun_count', None),
    ('verb_count', None),
    ('adj_count', None),
    ('adv_count', None),
    (' num_uppercasechars', None),
    (' num_possentiwords', None),
    (' num_negsentiwords', None),
    (' num_retweets', None),
    (' num_URLs', None),
    (' num_hashtags', None),
    (' num_mentions', None),
    (' num_exclammark', None),
    (' contains_exclammark', None),
    (' num_questmark', None),
    (' contains_questmark', None),
    (' text_length', None),
    (' num_words', None),
    (' num_posts', None),
    (' is_verified', None),
    (' has_url', None),
    (' times_listed', None),
    (' folfriend_ratio', None),
    (' num_followers', None),
    (' contains_thirdorderpron', None),
    (' contains_secondorderpron', None),
    (' contains_firstorderpron', None),
    (' contains_sademo', None),
    (' contains_happyemo', None),
])


In [165]:
# Logistic Regression on the non-text feature columns: xTrain / xTest come from
# X, which is final_merge without 'post_text'. The old label
# ("NB, WordLevel TF-IDF") named neither the model nor the features used here.
accuracy = train_model(linear_model.LogisticRegression(), xTrain, yTrain, xTest)
print ("LR, Non-text Features: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
# accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, yTrain, xvalid_tfidf_ngram)
# print ("NB, N-Gram Vectors: ", accuracy)

# # Naive Bayes on Character Level TF IDF Vectors
# accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, yTrain, xvalid_tfidf_ngram_chars)
# print ("NB, CharLevel Vectors: ", accuracy)



NB, WordLevel TF-IDF:  0.6788375558867362
