In [19]:
import pandas as pd
import nltk
import ast
import pickle
import re
from nltk.corpus import words
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [20]:
flairs = ['AskIndia', 'Non-Political', 'Scheduled', 'Photography', 'Science/Technology', 'Politics', 'Business/Finance', 'Policy/Economy', 'Sports', 'Food', 'Coronavirus']
label_to_id = {'AskIndia': 0,
 'Non-Political': 1,
 'Scheduled': 2,
 'Photography': 3,
 'Science/Technology': 4,
 'Politics': 5,
 'Business/Finance': 6,
 'Policy/Economy': 7,
 'Sports': 8,
 'Food': 9,
 'Coronavirus': 10}

id_to_label = {v: k for k, v in label_to_id.items()}

In [21]:
data = pd.read_csv('reddit-india-data.csv', nrows=1100)

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
REPLACE_BY_SPACE_URL = re.compile('[-_]')

cnt = 0

def clean_url(row):
    row = REPLACE_BY_SPACE_URL.sub(' ', row)
    initial = (row.split())[0]
    row = ' '.join((row.split())[1:])
    row = (re.split("[,.\-!?:/]+", row))
    initial = initial.split('/')
    initial = initial[len(initial) - 1]
    row = initial + " " + ' '.join(row)
    return row

def listtostr(listtext):
    comm_list = ast.literal_eval(listtext)
    comm_out = ' '.join(comm_list)
    return comm_out

def clean_text(text):
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in stop_words) # remove stopwords from text
    return text

wordlist = set(words.words())
def only_english(text):
    global wordlist
    rettext = " ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in wordlist or not w.isalpha())
    return rettext
    
data.fillna("",inplace = True)
data['title'] = data['title'].apply(clean_text)
data['body'] = data['body'].apply(clean_text)
data['comments'] = data['comments'].apply(listtostr)    
data['comments'] = data['comments'].apply(clean_text)
data['url'] = data['url'].apply(clean_url)

In [28]:
def printmetrics(y_test, y_pred):
    
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

def logisticreg(X_train, X_test, y_train, y_test):

    weights = {
    0 : 1.1,    1 : 1,    3 : 0.95,    4 : 1.05,    5 : 1.05,
        6: 1.05, 7 : 1.25,    8 : 0.95, 9: 0.95, 10 : 0.85, }
    logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(n_jobs=1, C=1, max_iter = 1000, class_weight = weights)),
                 ])
    logreg.fit(X_train, y_train)
    with open('model.pickle', 'wb') as f:
        pickle.dump(logreg, f)
    y_pred = logreg.predict(X_test)
    printmetrics(y_test, y_pred)
    
feat = data['title'] + data['body'] + data['url'] + data['comments']
split_ratio = 0.3
flairclass = data.flair
flairclass = flairclass.map(label_to_id)
X_train, X_test, y_train, y_test = train_test_split(feat, flairclass, test_size=split_ratio, random_state = 42)
logisticreg(X_train, X_test, y_train, y_test)

accuracy 0.7181818181818181
                    precision    recall  f1-score   support

          AskIndia       0.71      0.67      0.69        33
     Non-Political       0.54      0.52      0.53        25
         Scheduled       1.00      1.00      1.00        31
       Photography       0.86      0.86      0.86        35
Science/Technology       0.61      0.61      0.61        28
          Politics       0.71      0.78      0.74        37
  Business/Finance       0.68      0.68      0.68        28
    Policy/Economy       0.53      0.50      0.52        34
            Sports       0.89      0.86      0.88        29
              Food       0.74      0.74      0.74        27
       Coronavirus       0.56      0.61      0.58        23

          accuracy                           0.72       330
         macro avg       0.71      0.71      0.71       330
      weighted avg       0.72      0.72      0.72       330



This file mostly has code from Model. This has been made to compile the final model and extract it with Pickle so it can be directly loaded into the web application.