In [1]:
#import libraries
import pandas as pd

#Read data
data=pd.read_csv('https://confrecordings.ams3.digitaloceanspaces.com/News.csv')

#Count Unique values in Category column
print(data['Category'].value_counts())

#Create a dictionary d
d={'sport':0 , 'business':1 ,'tech':2 ,'entertainment':3, 'politics':4}

#Map dictionary to Category column
data['CategoryId'] = data['Category'].map(d)
print(data.head())



sport            191
business         187
tech             152
entertainment    140
politics         129
Name: Category, dtype: int64
   ArticleId                                               Text  Category  \
0       1833  worldcom ex-boss launches defence lawyers defe...  business   
1        154  german business confidence slides german busin...  business   
2       1101  bbc poll indicates economic gloom citizens in ...  business   
3       1976  lifestyle  governs mobile choice  faster  bett...      tech   
4        917  enron bosses in $168m payout eighteen former e...  business   

   CategoryId  
0           1  
1           1  
2           1  
3           2  
4           1  


In [5]:
import nltk
import re
import string
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('popular', quiet=True)
nltk.download('wordnet')
from nltk.corpus import stopwords
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from nltk.stem import WordNetLemmatizer

#Define a function for cleaning of text

def clean_text(text):
    text=text.lower()
    wordnet_lemmatizer = WordNetLemmatizer()
    text=' '.join(re.sub('[^a-zA-Z]'," ",text).split())
    text = nltk.word_tokenize(text)
    text = " ".join([wordnet_lemmatizer.lemmatize(word) for word in text if not word in stopwords.words('english')])
    return text
data['Text'] = data['Text'].apply(clean_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

#store text column in x
x = np.array(data.iloc[:,1].values)

#store CategoryID column in y variable
y = np.array(data.CategoryId.values)

#Extract features using CountVectorizer
cv = CountVectorizer(max_features = 5000)
x = cv.fit_transform(data.Text).toarray()

#print shape of x and y
print("X.shape = ",x.shape)
print("y.shape = ",y.shape)

#Split data into training and testing dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0, shuffle = True)

#print shape of x_train , x_test, y_train , y_test
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

X.shape =  (799, 5000)
y.shape =  (799,)
(559, 5000)
(240, 5000)
(559,)
(240,)


In [7]:
#Train model with SVM
from sklearn.svm import SVC
svm = SVC()
svm.fit(x_train,y_train)

#Predict for x_test value
y_pred = svm.predict(x_test)

#Calculate accuracy of model
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test,y_pred)*100
print(accuracy)

#import library
from sklearn.metrics import confusion_matrix

#Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

print('Confusion matrix\n\n', cm)

89.16666666666667
Confusion matrix

 [[56  0  2  0  0]
 [ 1 47  3  0  1]
 [ 0  4 44  0  2]
 [ 2  1  2 39  0]
 [ 0  3  5  0 28]]


In [9]:
#Predict for new value
new_pred = cv.transform(['Hour ago, I contemplated retirement for a lot of reasons. I felt like people were not sensitive enough to my injuries. I felt like a lot of people were backed, why not me? I have done no less. I have won a lot of games for the team, and I am not feeling backed, said Ashwin']).toarray()

yy = svm.predict(new_pred)

#Return result array in form of text string again
result = ""
if yy == [0]:
      result = "Sports News"
elif yy == [1]:
      result = "Business News"
elif yy == [2]:
      result = "Tech News"
elif yy == [3]:
      result = "Entertainment News"
elif yy == [4]:
      result = "Politics News"
print(result)

Sports News
