In [11]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the tweet dataset from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Twitter_data.csv')
df.head(n=5)

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [3]:
# Fetch the NLTK data packages the cells below rely on. Re-running is cheap:
# packages that are already present are simply reported as up-to-date.
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize load the 'punkt_tab' tables
# instead of the pickled 'punkt' models, so a fresh environment needs this
# resource too or tokenization fails with a LookupError.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SAHIL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SAHIL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\SAHIL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SAHIL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Work on the first 100 rows only to keep the preprocessing demo fast.
df_shortened = df.iloc[:100]

In [8]:
def preprocess_text(text):
    """Tokenize a text and derive filtered, stemmed and lemmatized token lists.

    Parameters
    ----------
    text : str
        The raw text to process.

    Returns
    -------
    tuple of four lists
        ``(tokens, filtered_tokens, stemmed_tokens, lemmatized_tokens)`` where

        * ``tokens`` is the output of ``word_tokenize(text)``;
        * ``filtered_tokens`` keeps the tokens whose lowercase form is not an
          English stop word;
        * ``stemmed_tokens`` is the Porter stem of each filtered token;
        * ``lemmatized_tokens`` is the WordNet lemma of each filtered token
          (computed from the filtered tokens, not from the stems).
    """
    # Build the stop-word lookup once per call. Evaluating
    # stopwords.words('english') inside the comprehension condition would
    # reload the whole list for every token and scan it linearly.
    english_stopwords = set(stopwords.words('english'))

    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in english_stopwords]

    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return tokens, filtered_tokens, stemmed_tokens, lemmatized_tokens

# Run preprocess_text over every tweet; each element of the resulting Series
# is the tuple returned by preprocess_text for one row.
preprocessed_data = df_shortened.clean_text.apply(preprocess_text)

# Show the (truncated) Series of per-row results
print(preprocessed_data)

0     ([when, modi, promised, “, minimum, government...
1     ([talk, all, the, nonsense, and, continue, all...
2     ([what, did, just, say, vote, for, modi, welco...
3     ([asking, his, supporters, prefix, chowkidar, ...
4     ([answer, who, among, these, the, most, powerf...
                            ...                        
95    ([country, prospers, when, the, women, the, co...
96    ([sabbash, mera, vote, for, peppermit, abvp], ...
97    ([yogi, adityanath, hold, 100, rallies, seek, ...
98    ([from, the, very, beginningmodi, doing, wada,...
99    ([modi, politics, hate, modiji, loves, india, ...
Name: clean_text, Length: 100, dtype: object


In [9]:
# Inspect the full four-part result for the row labelled 0.
print(preprocessed_data.loc[0])

(['when', 'modi', 'promised', '“', 'minimum', 'government', 'maximum', 'governance', '”', 'expected', 'him', 'begin', 'the', 'difficult', 'job', 'reforming', 'the', 'state', 'why', 'does', 'take', 'years', 'get', 'justice', 'state', 'should', 'and', 'not', 'business', 'and', 'should', 'exit', 'psus', 'and', 'temples'], ['modi', 'promised', '“', 'minimum', 'government', 'maximum', 'governance', '”', 'expected', 'begin', 'difficult', 'job', 'reforming', 'state', 'take', 'years', 'get', 'justice', 'state', 'business', 'exit', 'psus', 'temples'], ['modi', 'promis', '“', 'minimum', 'govern', 'maximum', 'govern', '”', 'expect', 'begin', 'difficult', 'job', 'reform', 'state', 'take', 'year', 'get', 'justic', 'state', 'busi', 'exit', 'psu', 'templ'], ['modi', 'promised', '“', 'minimum', 'government', 'maximum', 'governance', '”', 'expected', 'begin', 'difficult', 'job', 'reforming', 'state', 'take', 'year', 'get', 'justice', 'state', 'business', 'exit', 'psus', 'temple'])


In [13]:
# Fit a TF-IDF model on the tweet text (vocabulary capped at 5000 terms)
# and transform the same rows into the document-term matrix X.
vectorizer = TfidfVectorizer(max_features=5_000)
X = vectorizer.fit_transform(df_shortened.clean_text)

In [14]:
# Print the TF-IDF matrix; each output line reads "(row, column)  weight".
print(X)

  (0, 821)	0.19627128356761353
  (0, 677)	0.1801027365696738
  (0, 261)	0.1801027365696738
  (0, 107)	0.19627128356761353
  (0, 587)	0.10865356225628482
  (0, 37)	0.24303974579568058
  (0, 758)	0.31946555278393696
  (0, 446)	0.1801027365696738
  (0, 321)	0.1686309699098889
  (0, 957)	0.14099065625216428
  (0, 811)	0.15246242291194917
  (0, 213)	0.15973277639196848
  (0, 933)	0.14099065625216428
  (0, 789)	0.3602054731393476
  (0, 695)	0.19627128356761353
  (0, 437)	0.15973277639196848
  (0, 204)	0.19627128356761353
  (0, 830)	0.14973250463543986
  (0, 77)	0.19627128356761353
  (0, 371)	0.14099065625216428
  (0, 263)	0.19627128356761353
  (0, 334)	0.15973277639196848
  (0, 526)	0.19627128356761353
  (0, 335)	0.15246242291194917
  (0, 535)	0.19627128356761353
  :	:
  (99, 355)	0.14356509181295507
  (99, 113)	0.14356509181295507
  (99, 199)	0.14356509181295507
  (99, 240)	0.14356509181295507
  (99, 356)	0.14356509181295507
  (99, 503)	0.14356509181295507
  (99, 545)	0.5742603672518203
  (