In [None]:

# Import necessary libraries
import os
import numpy as np 
import pandas as pd 
import nltk
import re
import gc
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score,confusion_matrix
from nltk.stem.porter import PorterStemmer
from sklearn.ensemble import RandomForestClassifier





In [None]:
# Load the news-aggregator CSV from the Kaggle input directory.
df = pd.read_csv(
    '/kaggle/input/news-aggregator-dataset/uci-news-aggregator.csv',
    sep=',',
    header='infer',
)

In [None]:
# Report the row count, then display the (rows, columns) shape.
print('Number of Rows in df = %d' % df.shape[0])
df.shape

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Keep only the headline text (TITLE) and its label (CATEGORY).
title_category_df = df.loc[:,['TITLE','CATEGORY']]

In [None]:
# The full frame is no longer needed once the two columns have been extracted.
del df

In [None]:
# Ask the garbage collector to reclaim the memory released by the deletion above.
gc.collect()

In [None]:
# Preview the reduced TITLE / CATEGORY frame.
title_category_df.head()

In [None]:
# Count the missing values in each column.
title_category_df.isna().sum()

In [None]:
# Show the ten most frequent headlines (i.e. repeated titles) with their counts.
# NOTE(review): the previous comment said "count unique categories", but the code
# counts TITLE values -- confirm whether the 'CATEGORY' column was intended here.

title_category_df['TITLE'].value_counts().sort_values(ascending=False).head(10)

In [None]:
# Data Cleaning

* Convert each sentence to words (word tokenizer)
* Remove numbers & stop words
* Convert words to vectors (bag-of-words or TF-IDF)
* Apply Naive Bayes & Logistic Regression (Performance Req)


In [None]:
# Remove numbers, punctuation next to whitespace (e.g. `)`, `|`, `+`) and extra whitespace

In [None]:
# English stop-word list from the NLTK stopwords corpus (the corpus must be available locally).
stopword_english = stopwords.words('english')

In [None]:
def re_sub(s):
    """Normalize a piece of text before tokenization.

    The text is lower-cased, runs of digits are deleted, every
    whitespace/non-word character pair is replaced by a single space, and
    repeated whitespace is collapsed to one space.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        The cleaned text. Leading/trailing whitespace is not stripped.
    """
    s = s.lower()

    # Raw strings keep the regex escapes from being parsed as (invalid) Python
    # string escapes, which triggers a warning on current Python versions.
    s = re.sub(r'\d+', '', s)    # drop digits
    s = re.sub(r'\s\W', ' ', s)  # whitespace followed by a non-word character
    s = re.sub(r'\W\s', ' ', s)  # non-word character followed by whitespace
    s = re.sub(r'\s+', ' ', s)   # collapse runs of whitespace

    return s
    

In [None]:
# Clean every headline with re_sub and store the result alongside the original.
title_category_df['sent'] = title_category_df['TITLE'].map(re_sub)

In [None]:
# Porter stemmer instance used to reduce tokens to their stems.
stemmer = PorterStemmer()

In [None]:
# Split each cleaned headline into a list of word tokens.
title_category_df['word_tokens'] = title_category_df['sent'].apply(nltk.word_tokenize)

# Set membership is O(1); testing against the stop-word list would scan it
# linearly for every token of every headline.
stopword_lookup = set(stopword_english)
title_category_df['word_tokens_after_sw'] = title_category_df['word_tokens'].apply(
    lambda tokens: [word for word in tokens if word not in stopword_lookup]
)

# Reduce the remaining tokens to their Porter stems.
title_category_df['word_tokens_after_sw_stemmer'] = title_category_df['word_tokens_after_sw'].apply(
    lambda tokens: [stemmer.stem(word) for word in tokens]
)

# Re-join the stems into one comma-separated string per headline.
title_category_df['word_tokens_after_sw1'] = title_category_df['word_tokens_after_sw_stemmer'].apply(','.join)

In [None]:
# Encode the CATEGORY strings as integer class ids.
label_encode = LabelEncoder()
title_category_df['label_category'] = label_encode.fit_transform(title_category_df['CATEGORY'])

# Lookup from each original category value to its encoded id.
trans_x = {
    category: code
    for category, code in zip(label_encode.classes_, label_encode.transform(label_encode.classes_))
}

In [None]:
# List the columns now present on the working frame.
title_category_df.columns

In [None]:
# Keep only the model input (comma-joined stems) and the encoded label.
train_columns = title_category_df[['word_tokens_after_sw1','label_category']]

In [None]:
# Drop the wide intermediate frame and ask the collector to reclaim its memory.
del title_category_df
gc.collect()

In [None]:
# Hold out 20% of the headlines for evaluation. A fixed random_state makes the
# split, and every score computed from it, reproducible across kernel restarts.
X_train, x_test, Y_train, y_test = train_test_split(
    train_columns['word_tokens_after_sw1'],
    train_columns['label_category'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Size of the training split.
X_train.shape

In [None]:
# Vectorization 

In [None]:
# TF-IDF vectorizer with the vocabulary capped at 8000 terms; min_df=1 keeps
# terms that appear in at least one document.
vectorizer = TfidfVectorizer(max_features=8000,min_df=1  )


In [None]:
# Learn the vocabulary and IDF weights on the training split only, then reuse
# them for the test split. The matrices stay in SciPy sparse form: a dense
# (n_samples x 8000) float array is almost entirely zeros and can exhaust
# memory, and both MultinomialNB and LogisticRegression accept sparse input.
x_train_features = vectorizer.fit_transform(X_train)
x_test_features = vectorizer.transform(x_test)

In [None]:
# Fit a multinomial Naive Bayes classifier on the training features.
# alpha=1 is the additive (Laplace) smoothing parameter.

naivebayes = MultinomialNB(alpha=1)
naivebayes.fit(x_train_features,Y_train)

In [None]:
# Predict the encoded category of every held-out headline with Naive Bayes.
y_pred = naivebayes.predict(x_test_features)

In [None]:
# Share of held-out headlines predicted correctly, rounded to two decimals.
np.round(accuracy_score(y_test,y_pred),2)

In [None]:
# Logistic regression on the same TF-IDF features. The default max_iter=100 is
# often too few iterations for the lbfgs solver on a large, high-dimensional
# text matrix (scikit-learn then emits a ConvergenceWarning and returns a
# model that has not converged), so the iteration budget is raised.
lr = LogisticRegression(max_iter=1000)
lr.fit(x_train_features, Y_train)

# NOTE: y_pred is reused here and now holds the logistic-regression predictions.
y_pred = lr.predict(x_test_features)
np.round(accuracy_score(y_test, y_pred), 2)