<a href="https://colab.research.google.com/github/sanskriti17204/datascience_tutorial/blob/main/stemming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [62]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [41]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [43]:
news=pd.read_csv('/content/news.csv')

In [44]:
news.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [45]:
news.shape

(21417, 4)

In [46]:
news.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [47]:
#filling the missing values with null string
news=news.fillna(' ')

In [48]:
#merging the title and subject
news['content']=news['title'] + ' ' + news['date']

In [49]:
news.head()

Unnamed: 0,title,text,subject,date,content
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017","As U.S. budget fight looms, Republicans flip t..."
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",U.S. military to accept transgender recruits o...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",Senior U.S. Republican senator: 'Let Mr. Muell...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",FBI Russia probe helped by Australian diplomat...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",Trump wants Postal Service to charge 'much mor...


In [50]:
#sepearting feature and target
X=news.drop(columns='subject', axis=1)
Y=news['subject']

**Stemming**

stemming is the process of reducing a word to its root word

In [51]:
port_stem= PorterStemmer()

In [52]:
def stemming(content):
  stemmedcontent=re.sub('[^a-zA-Z]',' ',content)
  stemmedcontent=stemmedcontent.lower()
  stemmedcontent=stemmedcontent.split()
  stemmedcontent=[port_stem.stem(word) for word in stemmedcontent if not word in stopwords.words('english')]
  steamed_content=' '.join(stemmedcontent)
  return stemmedcontent

In [53]:
news['content']= news['content'].apply(stemming)

In [54]:
print(news['content'])

0        [u, budget, fight, loom, republican, flip, fis...
1        [u, militari, accept, transgend, recruit, mond...
2        [senior, u, republican, senat, let, mr, muelle...
3        [fbi, russia, probe, help, australian, diploma...
4        [trump, want, postal, servic, charg, much, ama...
                               ...                        
21412    [fulli, commit, nato, back, new, u, approach, ...
21413    [lexisnexi, withdrew, two, product, chines, ma...
21414          [minsk, cultur, hub, becom, author, august]
21415    [vatican, upbeat, possibl, pope, franci, visit...
21416    [indonesia, buy, billion, worth, russian, jet,...
Name: content, Length: 21417, dtype: object


In [55]:
X=news['content'].values
y=news['subject'].values

In [57]:
print(X)

[list(['u', 'budget', 'fight', 'loom', 'republican', 'flip', 'fiscal', 'script', 'decemb'])
 list(['u', 'militari', 'accept', 'transgend', 'recruit', 'monday', 'pentagon', 'decemb'])
 list(['senior', 'u', 'republican', 'senat', 'let', 'mr', 'mueller', 'job', 'decemb'])
 ... list(['minsk', 'cultur', 'hub', 'becom', 'author', 'august'])
 list(['vatican', 'upbeat', 'possibl', 'pope', 'franci', 'visit', 'russia', 'august'])
 list(['indonesia', 'buy', 'billion', 'worth', 'russian', 'jet', 'august'])]


In [58]:
print(Y)

0        politicsNews
1        politicsNews
2        politicsNews
3        politicsNews
4        politicsNews
             ...     
21412       worldnews
21413       worldnews
21414       worldnews
21415       worldnews
21416       worldnews
Name: subject, Length: 21417, dtype: object


In [67]:
#converting the textual data to feature vectors
vectorizer=TfidfVectorizer()
vectorizer.fit(y)


Y=vectorizer.transform(Y)

In [68]:
print(vectorizer.vocabulary_)

{'politicsnews': 0, 'worldnews': 1}


In [83]:
#converting the textual data to feature vectors
#vectorizer=TfidfVectorizer()
#vectorizer.fit(X)
#X=vectorizer.transform(X)
#print(X)