In [1]:
# importing dataset
import pandas as pd
import nltk
nltk.download(['punkt', 'stopwords', 'wordnet'])    # downloading for word tokenization
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# evaluating model using metrices
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
df = pd.read_csv("/content/drive/MyDrive/Deep Learning/Data.csv", encoding = "ISO-8859-1")
df.head(2)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,Top10,Top11,Top12,Top13,Top14,Top15,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2000-01-03,0,A 'hindrance to operations': extracts from the...,Scorecard,Hughes' instant hit buoys Blues,Jack gets his skates on at ice-cold Alex,Chaos as Maracana builds up for United,Depleted Leicester prevail as Elliott spoils E...,Hungry Spurs sense rich pickings,Gunners so wide of an easy target,Derby raise a glass to Strupar's debut double,"Southgate strikes, Leeds pay the penalty",Hammers hand Robson a youthful lesson,Saints party like it's 1999,Wear wolves have turned into lambs,Stump mike catches testy Gough's taunt,Langer escapes to hit 167,Flintoff injury piles on woe for England,Hunters threaten Jospin with new battle of the...,Kohl's successor drawn into scandal,The difference between men and women,"Sara Denver, nurse turned solicitor",Diana's landmine crusade put Tories in a panic,Yeltsin's resignation caught opposition flat-f...,Russian roulette,Sold out,Recovering a title
1,2000-01-04,0,Scorecard,The best lake scene,Leader: German sleaze inquiry,"Cheerio, boyo",The main recommendations,Has Cubie killed fees?,Has Cubie killed fees?,Has Cubie killed fees?,Hopkins 'furious' at Foster's lack of Hannibal...,Has Cubie killed fees?,A tale of two tails,I say what I like and I like what I say,"Elbows, Eyes and Nipples",Task force to assess risk of asteroid collision,How I found myself at last,On the critical list,The timing of their lives,Dear doctor,Irish court halts IRA man's extradition to Nor...,Burundi peace initiative fades after rebels re...,PE points the way forward to the ECB,Campaigners keep up pressure on Nazi war crime...,Jane Ratcliffe,Yet more things you wouldn't know without the ...,Millennium bug fails to bite


In [3]:
train = df[df['Date'] < '20150101']
test = df[df['Date'] > '20141231']
y_train = train['Label']
y_test = test['Label']

In [4]:
data = train.iloc[:, 2:].copy()
# to remove the unwanted variable like (comma colen etc)
data.replace("[^a-zA-Z0-9]", " ", regex=True, inplace=True)
data.head(2)   # everything is cleaned

Unnamed: 0,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,Top10,Top11,Top12,Top13,Top14,Top15,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,A hindrance to operations extracts from the...,Scorecard,Hughes instant hit buoys Blues,Jack gets his skates on at ice cold Alex,Chaos as Maracana builds up for United,Depleted Leicester prevail as Elliott spoils E...,Hungry Spurs sense rich pickings,Gunners so wide of an easy target,Derby raise a glass to Strupar s debut double,Southgate strikes Leeds pay the penalty,Hammers hand Robson a youthful lesson,Saints party like it s 1999,Wear wolves have turned into lambs,Stump mike catches testy Gough s taunt,Langer escapes to hit 167,Flintoff injury piles on woe for England,Hunters threaten Jospin with new battle of the...,Kohl s successor drawn into scandal,The difference between men and women,Sara Denver nurse turned solicitor,Diana s landmine crusade put Tories in a panic,Yeltsin s resignation caught opposition flat f...,Russian roulette,Sold out,Recovering a title
1,Scorecard,The best lake scene,Leader German sleaze inquiry,Cheerio boyo,The main recommendations,Has Cubie killed fees,Has Cubie killed fees,Has Cubie killed fees,Hopkins furious at Foster s lack of Hannibal...,Has Cubie killed fees,A tale of two tails,I say what I like and I like what I say,Elbows Eyes and Nipples,Task force to assess risk of asteroid collision,How I found myself at last,On the critical list,The timing of their lives,Dear doctor,Irish court halts IRA man s extradition to Nor...,Burundi peace initiative fades after rebels re...,PE points the way forward to the ECB,Campaigners keep up pressure on Nazi war crime...,Jane Ratcliffe,Yet more things you wouldn t know without the ...,Millennium bug fails to bite


In [5]:
# now convert text data into lowercase
for feature in data.columns:
  data[feature] = data[feature].str.lower()
data.head(1)

Unnamed: 0,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,Top10,Top11,Top12,Top13,Top14,Top15,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,a hindrance to operations extracts from the...,scorecard,hughes instant hit buoys blues,jack gets his skates on at ice cold alex,chaos as maracana builds up for united,depleted leicester prevail as elliott spoils e...,hungry spurs sense rich pickings,gunners so wide of an easy target,derby raise a glass to strupar s debut double,southgate strikes leeds pay the penalty,hammers hand robson a youthful lesson,saints party like it s 1999,wear wolves have turned into lambs,stump mike catches testy gough s taunt,langer escapes to hit 167,flintoff injury piles on woe for england,hunters threaten jospin with new battle of the...,kohl s successor drawn into scandal,the difference between men and women,sara denver nurse turned solicitor,diana s landmine crusade put tories in a panic,yeltsin s resignation caught opposition flat f...,russian roulette,sold out,recovering a title


In [6]:
X_data = []
for i in range(len(data.index)):
  X_data.append(" ".join(str(feature) for feature in data.iloc[i, 0:]))

In [7]:
# converts para into individual words 
stemming = PorterStemmer()
X = []
for i in range(len(X_data)):
  words = nltk.word_tokenize(X_data[i])
  word = [stemming.stem(word) for word in words if word not in set(stopwords.words('english'))]
  word = " ".join(word)
  X.append(word)

In [8]:
# Convert text into vectors using TF-IDF
vector = CountVectorizer(max_features=5000, ngram_range=(2,2))
X_train = vector.fit_transform(X)

In [9]:
X_train.shape

(3975, 5000)

In [28]:
randomclassifier=RandomForestClassifier(n_estimators=300,criterion='entropy', )
rfc = randomclassifier.fit(X_train, y_train)

In [11]:
## Predict for the Test Dataset
data = test.iloc[:, 2:].copy()

data.replace("[^a-zA-Z0-9]", " ", regex=True, inplace=True)

for feature in data.columns:
  data[feature] = data[feature].str.lower()

X_test = []
for i in range(len(data.index)):
  X_test.append(' '.join(str(x) for x in data.iloc[i,0:]))

stemming = PorterStemmer()
y = []
for i in range(len(X_test)):
  words = nltk.word_tokenize(X_test[i])
  word = [stemming.stem(word) for word in words if word not in set(stopwords.words('english'))]
  word = " ".join(word)
  y.append(word)

X_test = vector.transform(y)
print(X_test.shape)

In [29]:
# prediction
pred = randomclassifier.predict(X_test)

In [30]:
print(f"Confusion matrix :\n {confusion_matrix(y_test, pred)}")
print(f"Accuracy score : {accuracy_score(y_test, pred)}")
print(f"Classification report :\n {classification_report(y_test, pred)}")

Confusion matrix :
 [[156  30]
 [ 28 164]]
Accuracy score : 0.8465608465608465
Classification report :
               precision    recall  f1-score   support

           0       0.85      0.84      0.84       186
           1       0.85      0.85      0.85       192

    accuracy                           0.85       378
   macro avg       0.85      0.85      0.85       378
weighted avg       0.85      0.85      0.85       378



In [31]:
naive_baiye = MultinomialNB(alpha = 0.8)
model = naive_baiye.fit(X_train, y_train)

In [32]:
pred = naive_baiye.predict(X_test)

In [33]:
print(f"Confusion matrix :\n {confusion_matrix(y_test, pred)}")
print(f"Accuracy score : {accuracy_score(y_test, pred)}")
print(f"Classification report :\n {classification_report(y_test, pred)}")

Confusion matrix :
 [[156  30]
 [ 39 153]]
Accuracy score : 0.8174603174603174
Classification report :
               precision    recall  f1-score   support

           0       0.80      0.84      0.82       186
           1       0.84      0.80      0.82       192

    accuracy                           0.82       378
   macro avg       0.82      0.82      0.82       378
weighted avg       0.82      0.82      0.82       378

