In [None]:
#This notebook builds a model that predicts the class of a news article.
#First, import all the libraries required to clean the text and build the model.
import pandas as pd
from sklearn.svm import SVC
import spacy
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.svm import LinearSVC
from joblib import dump, load

In [None]:
# Import the training data.
# NOTE(review): read_excel is pointed at a path ending in '.csv' -- confirm the
# real file format; if this is a plain CSV file, pd.read_csv is the matching reader.
train= pd.read_excel('/content/train.csv')
# Preview the first rows to sanity-check the loaded columns.
train.head()

Unnamed: 0,text,class
0,? ? ? said as a result of its december acquisi...,3
1,? generale de banque sa lt ? br and lt heller ...,4
2,? shr 3 28 dlrs vs 22 cts shr diluted 2 99 dlr...,3
3,? the farmers home administration the u s agri...,4
4,? seton co said its board has received a propo...,4


In [None]:
# Text-cleaning helpers: turn the raw 'text' column into normalised, lemmatised
# strings that can be fed to the vectorizer.

# Matches every character that is not an ASCII letter.
_NON_LETTERS = re.compile('[^a-zA-Z]')
# Loaded spaCy pipelines keyed by model name, so a model is loaded only once.
_NLP_CACHE = {}


def _get_nlp(model_name='en_core_web_sm'):
    """Return the spaCy pipeline `model_name`, loading it on first use only."""
    if model_name not in _NLP_CACHE:
        _NLP_CACHE[model_name] = spacy.load(model_name)
    return _NLP_CACHE[model_name]


def clean_data(dataset, nlp=None):
    """Clean and lemmatise the 'text' column of `dataset`.

    Every entry is processed as follows: characters that are not ASCII letters
    are replaced by a space, the text is lower-cased, it is tokenised by the
    spaCy pipeline, and the lemmas of all tokens that are neither stop words
    nor whitespace are joined with single spaces.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a 'text' column. Rows are read in positional order, so the
        frame's index labels do not matter.
    nlp : spaCy Language pipeline, optional
        Pipeline used for tokenising and lemmatising. When omitted, the
        'en_core_web_sm' pipeline is loaded once and reused on later calls.

    Returns
    -------
    list of str
        One cleaned string per row of `dataset`, in row order. Missing text
        values yield an empty string.
    """
    if nlp is None:
        nlp = _get_nlp()

    # Iterate over the values (not index labels) so filtered or re-indexed
    # frames work, and map missing cells to '' instead of crashing in the regex.
    texts = [
        '' if pd.isna(raw) else _NON_LETTERS.sub(' ', str(raw)).lower()
        for raw in dataset['text']
    ]

    corpus = []
    # nlp.pipe processes the texts as a stream, which is faster than calling
    # nlp() separately on every document.
    for doc in nlp.pipe(texts):
        lemmas = [
            token.lemma_
            for token in doc
            if not (token.is_stop or token.is_space)
        ]
        corpus.append(" ".join(lemmas))
    return corpus

In [None]:
# Build the cleaned, lemmatised corpus (one string per row) for the training data
train_corpus=clean_data(train)


In [None]:
# Import the test data and clean it with the same clean_data routine that is
# used for the training data.
# NOTE(review): read_excel is pointed at a path ending in '.csv' -- confirm the
# real file format; if this is a plain CSV file, pd.read_csv is the matching reader.
test= pd.read_excel('/content/test.csv')
test_corpus=clean_data(test)

In [None]:
# TfidfVectorizer converts the cleaned text into TF-IDF weighted term features.
Feature_extracter = TfidfVectorizer()
# Build the feature matrix X and the label vector y that are fed to the model.
# .toarray() turns the sparse TF-IDF matrix into a dense array.
X = Feature_extracter.fit_transform(train_corpus).toarray()
# Select the labels by column name rather than by position, so the 'class'
# column is used even when the frame carries an extra leading index column.
y = train['class'].values


In [None]:
# Inspect the training corpus after conversion by TfidfVectorizer
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# For this model we use the Linear Support Vector Machine algorithm.
# random_state is fixed so that repeated runs produce the same fitted model.
model=LinearSVC(random_state=42)
model.fit(X,y)
# Predictions on the same data the model was fitted on (training-set predictions).
y_pred = model.predict(X)

In [None]:
# Print per-class precision, recall and F1. y_pred was predicted on the same X
# the model was fitted on, so these are training-set metrics rather than
# held-out performance.
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93        55
           1       0.94      0.97      0.95       432
           2       0.97      0.99      0.98        74
           3       0.97      0.98      0.97      3159
           4       0.97      0.96      0.96      1949
           5       0.90      0.53      0.67        17
           6       1.00      1.00      1.00        48
           7       0.94      1.00      0.97        16
           8       0.91      0.90      0.91       139
           9       0.95      0.97      0.96       101
          10       0.93      0.97      0.95       124
          11       0.95      0.95      0.95       390
          12       0.98      0.92      0.95        49
          13       0.96      0.97      0.96       172
          14       0.92      0.92      0.92        26
          15       1.00      1.00      1.00        20
          16       0.93      0.93      0.93       444
          17       1.00    

In [None]:
# Vectorise the cleaned test corpus with the vectorizer that was fitted on the
# training corpus, then convert the result to a dense array and display it.
test_features_sparse = Feature_extracter.transform(test_corpus)
X_test = test_features_sparse.toarray()
X_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Feed the transformed test data to the trained model to get the predicted classes
y_test_pred = model.predict(X_test)


In [None]:
# Display the dense test feature matrix
X_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Pair every test id with its predicted class: dataSet is a list of
# [id, prediction] rows, in the order of the test data.
ids = list(test['id'])

dataSet = [[row_id, prediction] for row_id, prediction in zip(ids, y_test_pred)]

In [None]:
import requests

# Serialise the predictions as one "id,prediction" line per row. str.join builds
# the payload in a single pass instead of re-copying the string on every iteration.
dataString = "\n".join(
    str(int(row[0])) + ',' + str(int(row[1])) for row in dataSet
)

# Request body sent to the scoring service.
postData = {
    'challengeName': 'newsclassification',
    'userID': '919409',
    'challengeType': 'multiclassification',
    'submissionsData': dataString,
}

url = 'https://8n46gxwibi.execute-api.us-east-2.amazonaws.com/default/computeModelScore'
# A timeout stops the cell from hanging forever if the service never answers.
x = requests.post(url, json=postData, timeout=60)

# Make a failed submission obvious instead of only echoing the raw response body.
if not x.ok:
    print('Submission failed with HTTP status', x.status_code)
print(x.text)


{"message":"Internal Server Error"}


In [None]:
# Save the trained model to disk with joblib
dump(model, 'Multi_class_News_Classifier.joblib') 

['Multi_class_News_Classifier.joblib']