In [49]:
import os

import nltk
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import resample


In [38]:
# Append the rows of news_data.csv to ../dataset.csv (no header, no index column).
# NOTE: the target file is opened in append mode, so every run of this cell
# writes the same rows to it again.
news_source_path = 'C:/Users/Hetal/Documents/GitHub/Text-classification-and-summarization/Data collection/news_data.csv'
dataset_path = '../dataset.csv'

df = pd.read_csv(news_source_path, index_col=0)

with open(dataset_path, 'a+', encoding="utf-8-sig", newline='') as dataset_file:
    df.to_csv(dataset_file, index=False, header=False)


In [83]:
#Encoding the target labels
# Reload the dataset and overwrite the 'category' column with the integer
# codes produced by LabelEncoder.
df = pd.read_csv("../dataset.csv")

label_encoder = preprocessing.LabelEncoder()
encoded_categories = label_encoder.fit_transform(df['category'])
df['category'] = encoded_categories

# Number of rows per encoded class.
category_counts = df['category'].value_counts()
print(category_counts)


3    969
0    796
1    620
4    586
2    552
5    312
Name: category, dtype: int64


In [84]:
#Upsampling
# Balance the classes: every class smaller than the largest one is resampled
# WITH replacement (rows are duplicated) up to the size of the largest class.
# The largest class itself is kept unchanged.
# NOTE(review): the train/test split in a later cell works on this upsampled
# frame, so copies of one row may land in both sets -- consider splitting
# first and upsampling only the training part.

class_counts = df['category'].value_counts()
majority_size = int(class_counts.max())  # derived from the data, not hard-coded

balanced_parts = []
for label in sorted(class_counts.index):
    class_rows = df[df.category == label]
    if len(class_rows) < majority_size:
        class_rows = resample(class_rows,
                              replace=True,             # sample with replacement
                              n_samples=majority_size,  # match the majority class
                              random_state=123)         # reproducible results
    balanced_parts.append(class_rows)

df_upsampled = pd.concat(balanced_parts)

#Display new class counts
df = df_upsampled
df_upsampled.category.value_counts()

3    969
2    969
5    969
1    969
4    969
0    969
Name: category, dtype: int64

In [59]:
#lemmatization

lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Return `text` with each token replaced by its WordNet lemma.

    The text is split with nltk.word_tokenize, every token is passed to
    `lemmatizer.lemmatize` (default arguments), and the lemmas are joined
    back together with single spaces.
    """
    tokens = nltk.word_tokenize(text)
    return ' '.join(map(lemmatizer.lemmatize, tokens))

# Lemmatized copy of the raw article text, stored in a new column.
df["text_lemmatized"] = df["news"].apply(lemmatize_text)

df.head()

Unnamed: 0,category,news,text_lemmatized,filtered_text
2271,0,"Actress Rachel Bloom and her husband, writer-p...","Actress Rachel Bloom and her husband , writer-...","Actress Rachel Bloom husband , writer-producer..."
365,0,WMC says Xstrata bid is too low\n\nAustralian ...,WMC say Xstrata bid is too low Australian mini...,WMC say Xstrata bid low Australian mining firm...
382,0,Fed warns of more US rate rises\n\nThe US look...,Fed warns of more US rate rise The US look set...,Fed warns US rate rise The US look set continu...
322,0,No seasonal lift for house market\n\nA swathe ...,No seasonal lift for house market A swathe of ...,No seasonal lift house market A swathe figure ...
98,0,"GM, Ford cut output as sales fall\n\nUS car fi...","GM , Ford cut output a sale fall US car firm G...","GM , Ford cut output sale fall US car firm Gen..."


In [60]:
#Removing stop words
stopword_list = nltk.corpus.stopwords.words('english')
# Set copy of the list so each token lookup is O(1) instead of a linear scan;
# stopword_list itself is kept as a list for any other cell that uses it.
stopword_set = frozenset(stopword_list)

def remove_stopwords(text):
    """Return `text` with NLTK English stop words removed.

    The text is split with nltk.word_tokenize and the surviving tokens are
    joined with single spaces. Matching is exact and case-sensitive, so a
    token is dropped only if it appears in the stop-word list verbatim
    (a capitalised form such as "The" is presumably kept -- confirm whether
    that is intended).
    """
    kept_tokens = [token for token in nltk.word_tokenize(text) if token not in stopword_set]
    return ' '.join(kept_tokens)

# Stop-word-free version of the lemmatized text, stored in a new column.
df["filtered_text"] = df["text_lemmatized"].apply(remove_stopwords)

df.head()

Unnamed: 0,category,news,text_lemmatized,filtered_text
2271,0,"Actress Rachel Bloom and her husband, writer-p...","Actress Rachel Bloom and her husband , writer-...","Actress Rachel Bloom husband , writer-producer..."
365,0,WMC says Xstrata bid is too low\n\nAustralian ...,WMC say Xstrata bid is too low Australian mini...,WMC say Xstrata bid low Australian mining firm...
382,0,Fed warns of more US rate rises\n\nThe US look...,Fed warns of more US rate rise The US look set...,Fed warns US rate rise The US look set continu...
322,0,No seasonal lift for house market\n\nA swathe ...,No seasonal lift for house market A swathe of ...,No seasonal lift house market A swathe figure ...
98,0,"GM, Ford cut output as sales fall\n\nUS car fi...","GM , Ford cut output a sale fall US car firm G...","GM , Ford cut output sale fall US car firm Gen..."


In [72]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split reproducible.
# NOTE(review): if df is the upsampled frame built in the upsampling cell,
# duplicated rows can fall on both sides of this split, which would inflate
# the accuracies reported below -- confirm, and consider splitting first.
features_text = df["filtered_text"]
category_labels = df["category"]

training_set, test_set, training_labels, test_labels = train_test_split(
    features_text,
    category_labels,
    test_size=0.33,
    random_state=24,
)


In [79]:
#Bag of words
# The vocabulary is learned from the training texts only (min_df=2 drops terms
# seen in fewer than 2 training documents); the test texts are then encoded
# with that same fitted vocabulary.
vectorizer = CountVectorizer(min_df=2)

bow_train_features = vectorizer.fit_transform(raw_documents=training_set)
bow_test_features = vectorizer.transform(raw_documents=test_set)

In [80]:
# Train multinomial Naive Bayes on the bag-of-words counts and report the
# accuracy on the held-out rows.
mnb = MultinomialNB().fit(bow_train_features, training_labels)

predictions = mnb.predict(bow_test_features)

# Accuracy of the predictions above (the same value mnb.score would return).
score = metrics.accuracy_score(test_labels, predictions)
print("Bag of words accuracy ",score)

Bag of words accuracy  0.938509640437728


In [81]:
#Tfidf
# TF-IDF weighting fitted on the training texts only; the test texts are
# transformed with the fitted vocabulary and IDF weights.
tfidf_options = dict(
    min_df=6,
    norm='l2',
    smooth_idf=True,
    use_idf=True,
)
tfidvectorizer = TfidfVectorizer(**tfidf_options)

tfid_train_features = tfidvectorizer.fit_transform(raw_documents=training_set)
tfid_test_features = tfidvectorizer.transform(raw_documents=test_set)

In [82]:
# Train a fresh multinomial Naive Bayes model on the TF-IDF features and
# report the accuracy on the held-out rows. This rebinds mnb, predictions and
# score.
mnb = MultinomialNB().fit(tfid_train_features, training_labels)

predictions = mnb.predict(tfid_test_features)

# Accuracy of the predictions above (the same value mnb.score would return).
score = metrics.accuracy_score(test_labels, predictions)
print("Tfidf accuracy ",score)

Tfidf accuracy  0.9338196977592496
