In [1]:
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from bs4 import BeautifulSoup
import pickle
import praw
import re

In [2]:
#Declarations
flairs = ['AskIndia', 'Business/Finance', 'CAA-NRC-NPR', 'Coronavirus', 'Food', 
          'Non-Political', 'Photography', 'Policy/Economy', 'Politics',
          'Scheduled', 'Science/Technology', 'Sports']

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
res = []

In [3]:
#Removing unwanted objections on text
def clean_text(text):
    text = BeautifulSoup(text, "lxml").text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

In [4]:
#Naive Bayes Classifier
def nb_classifier(X_train, X_test, y_train, y_test):
  
  from sklearn.naive_bayes import MultinomialNB


  nb = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', MultinomialNB()),
                ])
  nb.fit(X_train, y_train)

  y_pred = nb.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  #print(classification_report(y_test, y_pred,target_names=flairs))

In [5]:
#Linear Support Vector Machine
def linear_svm(X_train, X_test, y_train, y_test):
  
  from sklearn.linear_model import SGDClassifier

  sgd = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
                 ])
  sgd.fit(X_train, y_train)

  y_pred = sgd.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  #print(classification_report(y_test, y_pred,target_names=flairs))

In [6]:
#Logistic Regression
def logisticreg(X_train, X_test, y_train, y_test):
  logreg = []
  from sklearn.linear_model import LogisticRegression

  logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(n_jobs=1, C=1e5)),
                 ])
  logreg.fit(X_train, y_train)

  y_pred = logreg.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  #print(classification_report(y_test, y_pred,target_names=flairs))

In [7]:
#Random Forest
def randomforest(X_train, X_test, y_train, y_test):
  
  from sklearn.ensemble import RandomForestClassifier
  
  randfor = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),
                 ])
  randfor.fit(X_train, y_train)

  y_pred = randfor.predict(X_test)
  pickle.dump(randfor, open('LR.pkl', 'wb'))
  print('accuracy %s' % accuracy_score(y_pred, y_test))
  #print(classification_report(y_test, y_pred,target_names=flairs))

In [8]:
#MLP Classifier
def mlpclassifier(X_train, X_test, y_train, y_test):
  
  from sklearn.neural_network import MLPClassifier
  
  mlp = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', MLPClassifier(hidden_layer_sizes=(30,30,30))),
                 ])
  mlp.fit(X_train, y_train)

  y_pred = mlp.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  #print(classification_report(y_test, y_pred,target_names=flairs))

In [9]:
#Train Test Data
def train_test(X,y):
 
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

  print("Results of Naive Bayes Classifier")
  nb_classifier(X_train, X_test, y_train, y_test)
  print("Results of Linear Support Vector Machine")
  linear_svm(X_train, X_test, y_train, y_test)
  print("Results of Logistic Regression")
  logisticreg(X_train, X_test, y_train, y_test)
  print("Results of Random Forest")
  randomforest(X_train, X_test, y_train, y_test)
  print("Results of MLP Classifier")
  mlpclassifier(X_train, X_test, y_train, y_test)

In [10]:
#Filling appropriate format for training purposes
data = pd.read_csv('infom.csv')
data['Flair'] = data['Flair'].fillna('Nothing')
data['Title'] = data['Title'].apply(str)
data['Title'] = data['Title'].apply(clean_text)
data['Urls'] = data['Urls'].apply(str)
data['Urls'] = data['Urls'].apply(clean_text)
data['Comments'] = data['Comments'].apply(str)
data['Comments'] = data['Comments'].apply(clean_text)
combine = data['Title'] + data['Urls'] + data['Comments']
data = data.assign(combine = combine)

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that d

In [11]:
#Deploying for testing the Models on
data_fill = data.Flair
W = data.Title
X = data.Urls
Y = data.Comments
Z = data.combine
print("Flair Detection using Title as Feature")
train_test(W,data_fill)
print("Flair Detection using Urls as Feature")
train_test(X,data_fill)
print("Flair Detection using Comments as Feature")
train_test(Y,data_fill)
print("Flair Detection using Combined Features")
train_test(Z,data_fill)

Flair Detection using Title as Feature
Results of Naive Bayes Classifier
accuracy 0.5
Results of Linear Support Vector Machine
accuracy 0.55
Results of Logistic Regression
accuracy 0.5
Results of Random Forest
accuracy 0.6
Results of MLP Classifier




accuracy 0.6
Flair Detection using Urls as Feature
Results of Naive Bayes Classifier
accuracy 0.45
Results of Linear Support Vector Machine
accuracy 0.4
Results of Logistic Regression
accuracy 0.5
Results of Random Forest
accuracy 0.45
Results of MLP Classifier




accuracy 0.5
Flair Detection using Comments as Feature
Results of Naive Bayes Classifier
accuracy 0.45
Results of Linear Support Vector Machine
accuracy 0.35
Results of Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


accuracy 0.5
Results of Random Forest
accuracy 0.35
Results of MLP Classifier
accuracy 0.4
Flair Detection using Combined Features


