In [None]:
'''This task involves creating a Reddit Flair Detector. I have first 
preprocessed the data extracted from r/india.
After preprocessing, I tested the data with 3 classifier, Naive Bayees,
Support Vector Machine and Logistic Regression. The accuracy of Logistic
Regression was the highest, so I made my model using that. I uploaded my dataset 
on mongodb using atlas and studio 3T. I used that database test the models. 
I saved my final model, logistic regression in .sav file, to be able to use it
while predicting.'''

In [15]:
import pandas as pd
import nltk
import random
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
from numpy import random
import re
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
import pickle
import pymongo

In [2]:
'''Removing the stopwords'''
def remove_stopwords(text):
	for w in text:
		if w == "\n":
			w= " "
	words = [w for w in text if w not in stopwords.words("english")]
	return words

In [3]:
'''Stemming using Porter Stemmer. It involves converting all words to their root word'''
ps= PorterStemmer()

def stemming(text):
	lis = []
	for t in text:
		t = str(t)
		aux_l = ""
		words = []
		words = word_tokenize(t)
		for w in words:
			rootword = ps.stem(w)
			if(rootword.isalnum()):
				aux_l = aux_l+ rootword+ " "
		lis.append(str(aux_l))
	return lis

In [4]:
'''Reading the database to preprocess, convert items to use and preprocess in a list'''
df = pd.read_csv('testset.csv')
df.fillna("")
title= df["title"].tolist()
body= df["body"].tolist()
comment = df["comments"].tolist()
#print("-----------------")
#print(comment)


In [5]:
'''Removing stopwords from title, body and comments'''
t_remove = remove_stopwords(title)
b_remove = remove_stopwords(body)
c_remove = remove_stopwords(comment)


In [6]:
'''Converting to dataframe and storing in a csv file'''
df["title"] = t_remove
df["body"] = b_remove
df["comments"] = c_remove
df.to_csv("test_stopwords.csv")

In [7]:
'''Reading csv file with remove stopwords and applying stemming on them.'''
df = pd.read_csv('test_stopwords.csv')
df.fillna("")
title= df["title"].tolist()
body= df["body"].tolist()
comment = df["comments"].tolist()
t_remove_stem= stemming(title)
b_remove_stem = stemming(body)
c_remove_stem =  stemming(comment)
#c_remove_stem = stemming(c_remove)
#print(t_remove_stem)
#print("---------------------------------")
#print(b_remove_stem)

'for c in comment:\n\tval = stemming(c)\n\tprint(val)\n\tc_remove_stem.append(val)'

In [22]:
'''Converting stemmed data to data frame'''
t1 = pd.DataFrame(t_remove_stem)
b1 = pd.DataFrame(b_remove_stem)
c1 = pd.DataFrame(c_remove_stem)

In [23]:
'''Saving the data into a csv file'''
df["title"] = t1
df["body"] = b1
df["comments"] = c1
df.to_csv("preprocessed.csv")

In [8]:
'''declaring list of target flairs for model testing'''
flairs = ["Scheduled", "Politics", "Photography","Policy/Economy",
         "Sports","Non-Political","Science/Technology","Food",
            "Business/Finance","Coronavirus","Megathread","CAA-NRC","[R]eddiquette"]

In [9]:
#print(df.head())
#y = df["flair"]
#x = df.drop("flair", axis = 1)
#print(x)
#print(y)
#x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)
#print(x_train)
#print(x_train.shape, x_test.shape)
#print(x_test)
#train = x_train.to_csv("train.csv")
#test = x_test.to_csv("test.csv")

In [None]:
'''Testing the data along 3 classifiers: Naive Bayees,
Support Vector Machine and Logistic Regression. Used sklearn library for the same'''

In [10]:
'''Naive Bayees Classfier'''
def nb_classifier(X_train, X_test, y_train, y_test):
	from sklearn.naive_bayes import MultinomialNB
	nb = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB()),])
	nb.fit(X_train, y_train)
	y_pred = nb.predict(X_test)
	new_l = []
	"""for i in flairs:
					if(i in y_test):
						new_l.append(i)"""
	print("Results of Naive Bayes Classifier")
	print('accuracy ' + str(accuracy_score(y_pred, y_test)))
	print("Classification Report is: ")
	print(classification_report(y_test, y_pred,target_names=flairs))

In [11]:
'''SVM Classifier'''
def svm_classifier(X_train,X_test,y_train,y_test):
	from sklearn.linear_model import SGDClassifier
	svm = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', SGDClassifier()),])
	svm.fit(X_train, y_train)
	y_pred = svm.predict(X_test)
	new_l = []
	"""for i in flairs:
					if(i in y_test):
						new_l.append(i)"""

	print("Results of Support Vector Machine Classifier")
	print('accuracy ' + str(accuracy_score(y_pred, y_test)))
	print("Classification Report is: ")
	print(classification_report(y_test, y_pred,target_names=flairs))

In [12]:
'''Logistic Regression Classifier'''
def logreg_classifier(X_train,X_test,y_train,y_test):
	from sklearn.linear_model import LogisticRegression
	lgr = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', LogisticRegression()),])
	lgr.fit(X_train, y_train)
	new_l = []
	filename = 'finalized_model.sav'
	pickle.dump(lgr, open(filename, 'wb'))
	y_pred = lgr.predict(X_test)
	
	print("Results of Logistic Regression")
	print('accuracy ' + str(accuracy_score(y_pred, y_test)))
	print("Classification Report is: ")
	print(classification_report(y_test, y_pred,target_names=flairs))

In [13]:
'''Dividing the data into training and test dataset'''
def train_test(X,y):
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
	#nb_classifier(X_train, X_test, y_train, y_test)
	#svm_classifier(X_train, X_test, y_train, y_test)
	logreg_classifier(X_train, X_test, y_train, y_test)


In [16]:
'''Reading the data from MongoDB. Used pymongo to do the same.'''
#df = pd.read_csv('preprocessed.csv')
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client['Midaas']
preprocess = db.preprocessed
df = pd.DataFrame(list(preprocess.find()))
df["flair"] = df["flair"].values.astype('U')
df["comments"] = df["comments"].values.astype('U')
df["title"] = df["title"].values.astype('U')
df["body"] = df["body"].values.astype('U')
fl = df["flair"].tolist()
com = df["comments"].tolist()
tit = df["title"].tolist()
bod = df["body"].tolist()
combined = (df["comments"]+ df["title"]+df["body"]).tolist()
#df = df.assign(combine = combined)
#com = df.combine

In [20]:
'''Printing results of the final model for title, body,comments and combined'''
print("Flair Detection using Title as Feature")
train_test(tit,fl)
print("Flair Detection using Body as Feature")
train_test(bod,fl)
print("Flair Detection using Comments as Feature")
train_test(com,fl)
print("Flair Detection using Combined Features")
train_test(combined,fl) 

Flair Detection using Title as Feature
Results of Logistic Regression
accuracy 0.5308219178082192
Classification Report is: 
                    precision    recall  f1-score   support

         Scheduled       0.52      0.41      0.45        74
          Politics       0.64      0.76      0.69        70
       Photography       0.40      0.59      0.48        64
    Policy/Economy       0.61      0.54      0.57        79
            Sports       0.00      0.00      0.00         2
     Non-Political       0.26      0.28      0.27        68
Science/Technology       0.77      0.63      0.70        76
              Food       0.42      0.48      0.45        71
  Business/Finance       0.36      0.40      0.38        73
       Coronavirus       0.88      0.96      0.92        85
        Megathread       0.31      0.45      0.37        66
           CAA-NRC       0.86      0.64      0.73        69
     [R]eddiquette       0.48      0.19      0.27        79

          accuracy               

  _warn_prf(average, modifier, msg_start, len(result))


Results of Logistic Regression
accuracy 0.2237442922374429
Classification Report is: 
                    precision    recall  f1-score   support

         Scheduled       0.67      0.02      0.04        87
          Politics       0.00      0.00      0.00        80
       Photography       0.00      0.00      0.00        77
    Policy/Economy       0.78      0.23      0.36        78
            Sports       0.00      0.00      0.00         1
     Non-Political       0.00      0.00      0.00        69
Science/Technology       0.10      0.97      0.18        61
              Food       0.31      0.14      0.19        71
  Business/Finance       0.00      0.00      0.00        83
       Coronavirus       0.95      0.95      0.95        81
        Megathread       0.32      0.17      0.22        71
           CAA-NRC       0.00      0.00      0.00        60
     [R]eddiquette       0.17      0.32      0.22        57

          accuracy                           0.22       876
         mac

  _warn_prf(average, modifier, msg_start, len(result))


Results of Logistic Regression
accuracy 0.5582191780821918
Classification Report is: 
                    precision    recall  f1-score   support

         Scheduled       0.56      0.40      0.47        78
          Politics       0.66      0.66      0.66        76
       Photography       0.32      0.48      0.39        69
    Policy/Economy       0.70      0.81      0.75        68
            Sports       0.00      0.00      0.00         1
     Non-Political       0.33      0.21      0.26        75
Science/Technology       0.59      0.83      0.69        69
              Food       0.36      0.47      0.41        62
  Business/Finance       0.55      0.69      0.61        93
       Coronavirus       0.96      0.92      0.94        79
        Megathread       0.46      0.30      0.36        71
           CAA-NRC       0.84      0.47      0.60        68
     [R]eddiquette       0.45      0.42      0.43        67

          accuracy                           0.56       876
         mac

  _warn_prf(average, modifier, msg_start, len(result))


Results of Logistic Regression
accuracy 0.6438356164383562
Classification Report is: 
                    precision    recall  f1-score   support

         Scheduled       0.59      0.43      0.50        82
          Politics       0.70      0.70      0.70        60
       Photography       0.54      0.70      0.61        73
    Policy/Economy       0.77      0.81      0.79        63
            Sports       0.00      0.00      0.00         2
     Non-Political       0.45      0.33      0.38        64
Science/Technology       0.86      0.84      0.85        85
              Food       0.46      0.58      0.52        72
  Business/Finance       0.53      0.65      0.58        77
       Coronavirus       0.92      0.92      0.92        75
        Megathread       0.52      0.58      0.55        74
           CAA-NRC       0.93      0.76      0.83        83
     [R]eddiquette       0.48      0.39      0.43        66

          accuracy                           0.64       876
         mac

  _warn_prf(average, modifier, msg_start, len(result))
