Importing necessary packages

In [1]:
import numpy as np
import pandas as pd
import os
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

Configuring pandas display options: maximum rows, maximum columns, and output width

In [2]:
# Raise pandas' display limits (rows, columns, line width) for this notebook.
for option_name,option_value in (
    ('display.max_rows',700),
    ('display.max_columns',500),
    ('display.width',1000),
):
    pd.set_option(option_name,option_value)

Importing the dataset

In [3]:
data=pd.read_csv('Articles.csv',encoding='ISO-8859-1')

In [4]:
print('Number of rows: ',data.shape[0])
print('Number of columns: ',data.shape[1])

Number of rows:  2692
Number of columns:  4


In [5]:
# Printing columns in the data sets
print(data.columns)

Index(['Article', 'Date', 'Heading', 'NewsType'], dtype='object')


In [6]:
print(data['NewsType'].value_counts())

sports      1408
business    1284
Name: NewsType, dtype: int64


The dataset contains 1408 sports articles and 1284 business articles

In [7]:
# Printing some info related to the dataset (column dtypes and non-null counts).
# DataFrame.info() writes the report itself and returns None, so it is called
# directly: wrapping it in print() would append a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2692 entries, 0 to 2691
Data columns (total 4 columns):
Article     2692 non-null object
Date        2692 non-null object
Heading     2692 non-null object
NewsType    2692 non-null object
dtypes: object(4)
memory usage: 84.2+ KB
None


In [8]:
# removing the date columns since it is not a useful feature to classify articles
df=data.drop(columns=['Date'])

In [9]:
df.head()

Unnamed: 0,Article,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,asian stocks sink euro near nine year,business
4,NEW YORK: US oil prices Monday slipped below $...,us oil prices slip below 50 a barr,business


In [10]:
# Build one token list per article: the heading and the article text of each row
# are joined with a single space and split on single spaces. The token lists are
# stored, in row order, in the 'words' list variable.
arr=[]  # not used by any cell shown in this notebook
words=[
    (heading+' '+article).split(' ')
    for heading,article in zip(df['Heading'],df['Article'])
]
    

In [11]:
print('\'words\' variable contains for example: ')
print(words[0])

'words' variable contains for example: 
['sindh', 'govt', 'decides', 'to', 'cut', 'public', 'transport', 'fares', 'by', '7pc', 'kti', 'rej', 'KARACHI:', 'The', 'Sindh', 'government', 'has', 'decided', 'to', 'bring', 'down', 'public', 'transport', 'fares', 'by', '7', 'per', 'cent', 'due', 'to', 'massive', 'reduction', 'in', 'petroleum', 'product', 'prices', 'by', 'the', 'federal', 'government,', 'Geo', 'News', 'reported.Sources', 'said', 'reduction', 'in', 'fares', 'will', 'be', 'applicable', 'on', 'public', 'transport,', 'rickshaw,', 'taxi', 'and', 'other', 'means', 'of', 'traveling.Meanwhile,', 'Karachi', 'Transport', 'Ittehad', '(KTI)', 'has', 'refused', 'to', 'abide', 'by', 'the', 'government', 'decision.KTI', 'President', 'Irshad', 'Bukhari', 'said', 'the', 'commuters', 'are', 'charged', 'the', 'lowest', 'fares', 'in', 'Karachi', 'as', 'compare', 'to', 'other', 'parts', 'of', 'the', 'country,', 'adding', 'that', '80pc', 'vehicles', 'run', 'on', 'Compressed', 'Natural', 'Gas', '(CNG

In [12]:
# doing some necessary cleaning in the 'words' list (in place):
# colons are stripped from every token, and any token that is then not purely
# alphabetic is replaced by the empty string
for tokens in words:
    for position,raw_token in enumerate(tokens):
        without_colon=raw_token.replace(':','')
        tokens[position]=without_colon if without_colon.isalpha() else ''

In [13]:
# counting words and storing it in a dictionary format: 'word':'occurrence number'
words_dict=Counter()
for tokens in words:
    # Counter.update adds the counts in place. The previous `+= Counter(...)`
    # produced the same totals but re-scanned the whole accumulated counter on
    # every article (in-place addition also drops non-positive entries).
    words_dict.update(tokens)

In [14]:
type(Counter(words[0]))

collections.Counter

In [15]:
del words_dict['']
#deleting dictionary key where key is ''

In [16]:
len(words_dict)
# words_dict contains 25494 key-value pairs

25494

In [17]:
# Taking out most common 3500 words out of 25494
# we will use these 3500 words to train our model
# NOTE: most_common() returns a list of (word, count) tuples, so after this line
# words_dict is a list and no longer a Counter. Re-running this cell on its own
# fails (a list has no most_common method); re-run the counting cells first.
words_dict=words_dict.most_common(3500)

In [18]:
# feature engineering
# Build the bag-of-words rows: one row per article in 'words', one entry per
# (word, count) pair in words_dict, holding how many times that word occurs
# among the article's tokens.
# The scratch names deliberately avoid 'data', which is the raw DataFrame loaded
# from Articles.csv; rebinding it here would break the earlier cells on re-run.
vocabulary=[entry[0] for entry in words_dict]
features=[]
for tokens in words:
    # Count the article's tokens once instead of calling list.count() for every
    # vocabulary word; a Counter yields 0 for absent words, like list.count().
    token_counts=Counter(tokens)
    features.append([token_counts[word] for word in vocabulary])

In [19]:
# Feature matrix 'x' (the independent variables): one row per article,
# one column per vocabulary word
x=np.array(features)

In [20]:
x[0]

array([5, 5, 4, ..., 0, 0, 0])

In [21]:
x.shape

(2692, 3500)

In [22]:
# Since we need to predict the class of the article, NewsType will be our target variable
# Labels are encoded as integers: sports -> 0, business -> 1.
label_codes={'sports':0,'business':1}
# An explicit mapping plus an int64 cast is used instead of Series.replace:
# replace() on an object column relies on implicit downcasting to produce
# integers, which newer pandas versions deprecate. Values that are already
# encoded pass through unchanged, so the cell can be re-run safely, and an
# unknown label raises in astype() instead of silently staying a string.
df['NewsType']=df['NewsType'].map(lambda label: label_codes.get(label,label)).astype('int64')

target=df['NewsType'].values

In [23]:
target.shape

(2692,)

In [24]:
# Target Variable
y=np.array(target)

In [25]:
classifier=MultinomialNB()

In [26]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=7)

In [27]:
classifier.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [28]:
y_pred=classifier.predict(x_test)

In [29]:
accuracy_score(y_test,y_pred)*100

99.25788497217069

In [30]:
def checker(heading,body):
    """Pass a heading, and body of the article to use this function.
        Returns whether article is sports or business.

    The heading and body are joined with a space and split on single spaces.
    Tokens are then cleaned the same way as the training tokens: colons are
    stripped and tokens that are not purely alphabetic are discarded, so a
    dateline such as 'MUMBAI:' is counted as 'MUMBAI'. The cleaned tokens are
    counted against the (word, count) entries of the module-level
    ``words_dict`` and the resulting row is passed to the module-level
    ``classifier``.

    Parameters
    ----------
    heading : str
        Article heading.
    body : str
        Article text.

    Returns
    -------
    str or None
        'sports' when the predicted label is 0, 'business' when it is 1,
        otherwise None.
    """
    token_counts=Counter()
    for token in (heading+' '+body).split(' '):
        cleaned=token.replace(':','')
        if cleaned.isalpha():
            token_counts[cleaned]+=1
    # A Counter yields 0 for words that do not occur in the article.
    row=[token_counts[entry[0]] for entry in words_dict]
    # reshape(1,-1) makes a single-sample matrix whatever the vocabulary size is,
    # instead of hard-coding the number of vocabulary words.
    ans=classifier.predict(np.array(row).reshape(1,-1))[0]
    if ans==0:
        return 'sports'
    if ans==1:
        return 'business'
    return None
    

In [31]:
heading="""How To Get Working Capital Loans In India"""
body="""The business owners are aware of the fact that sometimes the problem in dealing with the negative cashflow can kill the business and that’s why a small funding can be helpful in resolving short term problems. The working capital loans can help your business to function properly and efficiently. This type of business loan can be one of the simplest methods for protecting the credibility of a company.

In this article, we’ve explained how to get working capital loans in India. Read more to learn about eligibility criteria to get business loan, when you should consider this type of loan and which sites can help you get it."""

In [32]:
heading1="""Assigning higher risk weights on unrated cos improved repayments: RBI study"""
body1="""MUMBAI: A policy shift to insist on higher risk weights on large loans which are unrated has had the desired impact with a shift in borrower behaviour, a Reserve Bank study has found.

The RBI had introduced the move in 2016 with a view to nudge banks to have rated exposures.

"In aggregate terms, the change in policy resulted in a 50 percent decline in the treated borrowers' likelihood of switching from rated to unrated categories over quarters," an RBI study by Pallavi Chavan a .."""

In [33]:
heading2="""Why Pakistan are all but out of the World Cup"""
body2="""NEW DELHI: The similarities between Pakistan's World Cup campaigns in 1992 and 2019 have ended. England's thumping victory against New Zealand on Wednesday has ensured that Pakistan have been virtually knocked out of the World Cup semifinal race. Pakistan's semifinal chances are now reduced to just a mathematical possibility, and that too a near impossible one. Sarfaraz Ahmed and his team need an improbable win over Bangladesh to pip New Zealand on net run rate. The road to the knockout stage for Sarfaraz Ahmed and his men is just an improbable mathematical calculation and that too provided Pakistan win the toss and bat first.

Currently placed fifth with nine points from eight matches, Pakistan face the challenge of beating Bangladesh by 311 runs if they score 350 or by 316 runs, if they score 400. Just a win for Pakistan will mean they will be tied with New Zealand on 11 points and 5 wins each, but the Kiwis will still be ahead on Net run rate. Pakistan's current Net run rate is -0.79.

If Pakistan lose the toss and are asked to field, their minuscule semifinal hopes, will be over even before the first ball is bowled at Lord's.

New Zealand finished their league stage engagements at 11 points from nine matches, after their 119-run loss to England. But, despite the massive defeat, the Kiwis are way ahead of Pakistan, in terms of their net run-rate, which is +0.17, compared to Pakistan's -0.79. """

In [34]:
ans=checker(heading1,body1)

In [35]:
ans

'business'