In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Learn a bag-of-words vocabulary from a single example sentence using the
# vectorizer's default settings (the vocabulary shown below contains
# lower-cased single words only).
cv = CountVectorizer()
cv.fit(['Great Subhash Chandra Bose is the main reason why British left India'])

In [3]:
cv.vocabulary_

{'great': 3,
 'subhash': 9,
 'chandra': 2,
 'bose': 0,
 'is': 5,
 'the': 10,
 'main': 7,
 'reason': 8,
 'why': 11,
 'british': 1,
 'left': 6,
 'india': 4}

# Use of n-gram

In [4]:
# ngram_range=(2,2): the vocabulary is built from bigrams (pairs of adjacent
# words) only.
cv = CountVectorizer(ngram_range=(2,2))
cv.fit(['Great Subhash Chandra Bose is the main reason why British left India'])
cv.vocabulary_

{'great subhash': 3,
 'subhash chandra': 8,
 'chandra bose': 2,
 'bose is': 0,
 'is the': 4,
 'the main': 9,
 'main reason': 6,
 'reason why': 7,
 'why british': 10,
 'british left': 1,
 'left india': 5}

In [5]:
# ngram_range=(3,3): the vocabulary is built from trigrams (three adjacent
# words) only.
cv = CountVectorizer(ngram_range=(3,3))
cv.fit(['Great Subhash Chandra Bose is the main reason why British left India'])
cv.vocabulary_

{'great subhash chandra': 3,
 'subhash chandra bose': 7,
 'chandra bose is': 2,
 'bose is the': 0,
 'is the main': 4,
 'the main reason': 8,
 'main reason why': 5,
 'reason why british': 6,
 'why british left': 9,
 'british left india': 1}

In [6]:
# ngram_range=(1,3): single words, bigrams and trigrams all share one
# vocabulary, so the index space grows accordingly.
cv = CountVectorizer(ngram_range=(1,3))
cv.fit(['Great Subhash Chandra Bose is the main reason why British left India'])
cv.vocabulary_

{'great': 9,
 'subhash': 24,
 'chandra': 6,
 'bose': 0,
 'is': 13,
 'the': 27,
 'main': 18,
 'reason': 21,
 'why': 30,
 'british': 3,
 'left': 16,
 'india': 12,
 'great subhash': 10,
 'subhash chandra': 25,
 'chandra bose': 7,
 'bose is': 1,
 'is the': 14,
 'the main': 28,
 'main reason': 19,
 'reason why': 22,
 'why british': 31,
 'british left': 4,
 'left india': 17,
 'great subhash chandra': 11,
 'subhash chandra bose': 26,
 'chandra bose is': 8,
 'bose is the': 2,
 'is the main': 15,
 'the main reason': 29,
 'main reason why': 20,
 'reason why british': 23,
 'why british left': 32,
 'british left india': 5}

# Cleaning Text:

In [7]:
import spacy 
# Load spaCy's small English pipeline; it is used by cleanText() for stop-word
# and punctuation detection and for lemmatization. The "en_core_web_sm"
# package has to be installed in the environment for this call to succeed.
nlp = spacy.load("en_core_web_sm")

In [8]:
def cleanText(text):
    """Return `text` reduced to the lemmas of its meaningful tokens.

    The text is parsed with the module-level spaCy pipeline `nlp`. Tokens that
    are stop words or punctuation are discarded; the lemma of every remaining
    token is kept, in order, and the lemmas are joined with single spaces.
    """
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(lemmas)

In [9]:
cleanText('Great Subhash Chandra Bose is the main reason why British left India')

'Great Subhash Chandra Bose main reason British leave India'

In [10]:
# Keep the cleaned sentence so it can be vectorized in the cells that follow.
cleaned_text = cleanText('Great Subhash Chandra Bose is the main reason why British left India')

In [11]:
#  BOW n-gram of cleaned_text

In [12]:
# Unigram + bigram vocabulary of the cleaned sentence. Because stop words were
# removed first, bigrams such as "bose main" join words that were not adjacent
# in the raw text.
cv = CountVectorizer(ngram_range=(1,2))
cv.fit([cleaned_text])
cv.vocabulary_

{'great': 6,
 'subhash': 15,
 'chandra': 4,
 'bose': 0,
 'main': 11,
 'reason': 13,
 'british': 2,
 'leave': 9,
 'india': 8,
 'great subhash': 7,
 'subhash chandra': 16,
 'chandra bose': 5,
 'bose main': 1,
 'main reason': 12,
 'reason british': 14,
 'british leave': 3,
 'leave india': 10}

In [13]:
# Performing on actual dataset

# News classification

In [14]:
import pandas as pd

# Read the news dataset from a CSV file located relative to the notebook's
# working directory, then show its dimensions and the first rows.
df = pd.read_csv('news-article-categories.csv')
print(df.shape)
df.head()

(6877, 3)


Unnamed: 0,category,title,body
0,ARTS & CULTURE,Modeling Agencies Enabled Sexual Predators For...,"In October 2017, Carolyn Kramer received a dis..."
1,ARTS & CULTURE,Actor Jeff Hiller Talks “Bright Colors And Bol...,This week I talked with actor Jeff Hiller abou...
2,ARTS & CULTURE,New Yorker Cover Puts Trump 'In The Hole' Afte...,The New Yorker is taking on President Donald T...
3,ARTS & CULTURE,Man Surprises Girlfriend By Drawing Them In Di...,"Kellen Hickey, a 26-year-old who lives in Huds..."
4,ARTS & CULTURE,This Artist Gives Renaissance-Style Sculptures...,There’s something about combining the traditio...


# Data Cleaning

In [15]:
# Only the article body is used as the model input later on, so the headline
# column is removed. Reassigning the result (rather than inplace=True) together
# with errors="ignore" keeps this cell safe to re-run: a second execution no
# longer raises KeyError because "title" is already gone.
df = df.drop(columns=["title"], errors="ignore")


In [16]:
df.head()

Unnamed: 0,category,body
0,ARTS & CULTURE,"In October 2017, Carolyn Kramer received a dis..."
1,ARTS & CULTURE,This week I talked with actor Jeff Hiller abou...
2,ARTS & CULTURE,The New Yorker is taking on President Donald T...
3,ARTS & CULTURE,"Kellen Hickey, a 26-year-old who lives in Huds..."
4,ARTS & CULTURE,There’s something about combining the traditio...


In [17]:
df.isnull().sum()

category    0
body        5
dtype: int64

In [18]:
# Remove every row that has a missing value; the null count above shows these
# are the 5 articles without a body. The frame is modified in place.
df.dropna(inplace=True)

In [19]:
df.isnull().sum()

category    0
body        0
dtype: int64

# Balancing data

In [20]:
# NOTE(review): x1 holds the labels and y1 holds the article text, which is the
# opposite of the x/y convention used for the train/test split further down
# (body = x, category_num = y). Neither name is referenced again in the cells
# visible here - confirm whether these two assignments are still needed.
x1 = df["category"]
y1 = df["body"]

In [21]:
df["category"].value_counts()

category
ARTS & CULTURE    1001
BUSINESS           501
ENTERTAINMENT      501
ENVIRONMENT        501
POLITICS           501
RELIGION           501
SPORTS             501
TECH               501
WOMEN              501
EDUCATION          490
COMEDY             376
SCIENCE            350
MEDIA              347
CRIME              300
Name: count, dtype: int64

In [22]:
# Size of the rarest category; every class is down-sampled to this many rows.
category_counts = df["category"].value_counts()
min_samples = int(category_counts.min())
min_samples

300

In [23]:
# Separate every category into its own down-sampled frame so that each class
# contributes exactly `min_samples` rows to the balanced dataset.
def _sample_category(name):
    """Return `min_samples` randomly drawn rows of `df` whose category is `name`."""
    return df[df.category == name].sample(min_samples, random_state=2022)


df_ARTS_and_CULTURE = _sample_category("ARTS & CULTURE")
df_BUSINESS = _sample_category("BUSINESS")
df_ENTERTAINMENT = _sample_category("ENTERTAINMENT")
df_ENVIRONMENT = _sample_category("ENVIRONMENT")
df_POLITICS = _sample_category("POLITICS")
df_RELIGION = _sample_category("RELIGION")
df_SPORTS = _sample_category("SPORTS")
df_TECH = _sample_category("TECH")
df_WOMEN = _sample_category("WOMEN")
df_EDUCATION = _sample_category("EDUCATION")
df_COMEDY = _sample_category("COMEDY")
df_SCIENCE = _sample_category("SCIENCE")
df_MEDIA = _sample_category("MEDIA")
df_CRIME = _sample_category("CRIME")

In [24]:
# Stack the equally sized per-category samples into one balanced frame and
# confirm that every class now has the same number of rows.
balanced_parts = [
    df_ARTS_and_CULTURE,
    df_BUSINESS,
    df_ENTERTAINMENT,
    df_ENVIRONMENT,
    df_POLITICS,
    df_RELIGION,
    df_SPORTS,
    df_TECH,
    df_WOMEN,
    df_EDUCATION,
    df_COMEDY,
    df_SCIENCE,
    df_MEDIA,
    df_CRIME,
]
df_balanced = pd.concat(balanced_parts, axis=0)
df_balanced["category"].value_counts()

category
ARTS & CULTURE    300
BUSINESS          300
ENTERTAINMENT     300
ENVIRONMENT       300
POLITICS          300
RELIGION          300
SPORTS            300
TECH              300
WOMEN             300
EDUCATION         300
COMEDY            300
SCIENCE           300
MEDIA             300
CRIME             300
Name: count, dtype: int64

# Encoding:

In [25]:
# Integer id for every news category.
#
# The literal used to list 'BUSINESS' twice (first as 1, then as 2). Python
# keeps only the last value for a repeated key, so BUSINESS has always been
# encoded as 2 and the first entry was dead code. The dead entry is removed and
# the remaining ids are left untouched so that they still match results that
# were computed with this mapping; as a consequence id 1 is unused.
labels = {
    'ARTS & CULTURE': 0,
    'BUSINESS': 2,
    'ENTERTAINMENT': 3,
    'ENVIRONMENT': 4,
    'POLITICS': 5,
    'RELIGION': 6,
    'SPORTS': 7,
    'TECH': 8,
    'WOMEN': 9,
    'EDUCATION': 10,
    'COMEDY': 11,
    'SCIENCE': 12,
    'MEDIA': 13,
    'CRIME': 14,
}

In [26]:
# Encode each category name as its integer id from `labels`. A category that is
# missing from `labels` would be mapped to NaN by Series.map.
df_balanced['category_num'] = df_balanced.category.map(labels)

In [44]:
df_balanced.sample(7)

Unnamed: 0,category,body,category_num
4555,RELIGION,Nearly three months after his inauguration as ...,6
5207,SCIENCE,Planting new emotions in unwitting people’s mi...,12
3368,ENVIRONMENT,By Environment Correspondent Alister Doyle OSL...,4
2073,CRIME,A Connecticut man is facing an attempted murde...,14
1079,BUSINESS,LONDON (Reuters) - Bitcoin tumbled 18 percent ...,2
4178,POLITICS,WASHINGTON (Reuters) - U.S. Justice Department...,5
6493,WOMEN,LOS ANGELES ― Gloria Steinem sees your worries...,9


In [28]:
# splitting

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
# Hold out 20% of the balanced data for evaluation. Stratifying on the label
# keeps the class proportions identical in the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    df_balanced.body,
    df_balanced.category_num,
    test_size=0.2,
    random_state=42,
    stratify=df_balanced.category_num,
)

# Model building

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

# Two-step pipeline: raw article text -> unigram counts -> multinomial Naive
# Bayes. Because vectorization is part of the pipeline, fit() and predict()
# take raw strings directly.
clf = Pipeline([
    ('vectorizer',CountVectorizer(ngram_range=(1,1))),
    ('multiNB',MultinomialNB())
])

clf.fit(x_train,y_train)

In [32]:
# Evaluation

In [33]:
# Predicted integer category ids for the held-out articles.
y_pred = clf.predict(x_test)

In [34]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the test split. The row labels are the
# integer ids defined in `labels`.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.78      0.67      0.72        60
           2       0.65      0.68      0.67        60
           3       0.62      0.77      0.69        60
           4       0.92      0.77      0.84        60
           5       0.65      0.73      0.69        60
           6       0.75      0.97      0.85        60
           7       0.94      0.80      0.86        60
           8       0.88      0.73      0.80        60
           9       0.67      0.65      0.66        60
          10       0.65      0.92      0.76        60
          11       1.00      0.15      0.26        60
          12       0.91      0.80      0.85        60
          13       0.61      0.73      0.67        60
          14       0.71      0.93      0.81        60

    accuracy                           0.74       840
   macro avg       0.77      0.74      0.72       840
weighted avg       0.77      0.74      0.72       840



In [35]:
y_test[:10]

4638     6
5033    12
3875    13
6205     8
6812     9
1267     2
5133    12
6292     8
2618    10
4427     5
Name: category_num, dtype: int64

In [36]:
y_pred[:10]

array([ 6, 12, 13,  8,  3,  2, 12,  8, 10,  5], dtype=int64)

In [91]:
# it predicted 9 correctly out of 10

In [None]:

# labels = {
#     'ARTS & CULTURE':0,
#     'BUSINESS':2,   # (the duplicate 'BUSINESS':1 entry never took effect; id 1 is unused)
#     'ENTERTAINMENT':3,
#     'ENVIRONMENT':4,       
#     'POLITICS':5,          
#     'RELIGION':6,          
#     'SPORTS':7,            
#     'TECH':8,              
#     'WOMEN':9,           
#     'EDUCATION':10,         
#     'COMEDY':11,            
#     'SCIENCE':12,           
#     'MEDIA':13,
#     'CRIME':14
# }

In [92]:
# Parallel lists of category names and their integer ids, in `labels` order.
key_list = list(labels.keys())
val_list = list(labels.values())


def category_name(news_id):
    """Print the category name that belongs to a numeric label id.

    Parameters
    ----------
    news_id : int or single-element array-like
        A label id as stored in `labels`. The raw result of predicting one
        document (a one-element NumPy array) is accepted as well as a plain
        or NumPy integer.

    Returns
    -------
    None
        The name is printed, not returned.

    Raises
    ------
    ValueError
        If `news_id` contains more than one value or is not a known id.
    """
    # NumPy scalars and arrays expose tolist(); it yields a plain int or a
    # (possibly nested) list, which avoids relying on the truth value of an
    # array during the list lookup below.
    if hasattr(news_id, "tolist"):
        news_id = news_id.tolist()
    if isinstance(news_id, (list, tuple)):
        if len(news_id) != 1:
            raise ValueError(
                "category_name expects exactly one label id, got %d" % len(news_id)
            )
        news_id = news_id[0]

    # list.index raises ValueError when the id is not present in `labels`.
    position = val_list.index(news_id)
    print(key_list[position])

In [46]:
# Let's try it on a current science news article:

In [47]:
news = ['''
        The Helios mission’s study of the sun provided key insights into the solar wind, the sun’s magnetic field, galactic rays and more (SN: 12/21/74). Its success helped pave the way for NASA’s Parker Solar Probe, which has been observing our star’s outer atmosphere, or corona, since 2018. Parker has dipped within 5 million miles of the sun’s surface and will eventually swing within 4 million miles. Such encounters have pinpointed a layer that separates the corona from interstellar space and found evidence that snappy magnetic field lines may accelerate the solar wind (SN: 12/15/21; SN: 06/09/23). The probe will next encounter the sun in September.
''']

In [93]:
# `clf` is a Pipeline with its own vectorizer, so the raw article is passed to
# it directly. The earlier `cv.transform(news)` call was removed: `cv` is the
# demo vectorizer fitted on a single sentence, and its result was never used.
# predict() returns one id per document; take the only element so that a plain
# scalar is handed to category_name().
news_id = clf.predict(news)[0]
category_name(news_id)

SCIENCE


In [94]:
# One more test, this time with a crime news article:
news2 = [
    '''
    PORTLAND, Maine — Portland police stopped two teens after they allegedly made an illegal U-turn just after midnight Monday, which resulted in two arrests for drug trafficking.

During the traffic stop, the officer discovered one of the occupants of the vehicle was in violation of their bail conditions, Portland police spokesperson Brad Nadeau said Monday in a news release.

Cole Swan, 18, of Yarmouth was arrested and charged with aggravated trafficking of scheduled drugs, unlawful possession of cocaine, and violating conditions of release and was taken to Cumberland County Jail, Nadeau said. 

A 16-year-old juvenile from Yarmouth was also arrested and charged with aggravated trafficking of scheduled drugs and unlawful possession of cocaine. Police said he was taken to Long Creek Youth Development Center.

Police took into evidence 62 grams of cocaine base, 8.17 grams of cocaine HCL, $1,550 in cash, a digital working scale, and a loaded handgun, according to the release.

Investigators are urging anyone who has any information that might assist in this or any other case to call them at 207-874-8575. You may also text the keyword PPDME and your message to 847411.
    '''
]

In [95]:
# The pipeline vectorizes the raw text itself. The earlier `cv.transform(news2)`
# call was removed: `cv` is the demo vectorizer fitted on a single sentence,
# and its result was never used.
news_id = clf.predict(news2)[0]
category_name(news_id)


CRIME
