# Data cleaning 

In [None]:
import pandas as pd
import numpy as np

# Ignore warning messages.
# NOTE(review): with no category given this silences every warning for the rest
# of the session, including deprecation warnings from the libraries used below.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the NEP 2020 English-tweet CSV into the working DataFrame
raw_tweets_csv = 'NEP_2020_english_tweet.csv'
data = pd.read_csv(raw_tweets_csv)

In [None]:
data.head()

Unnamed: 0.1,Unnamed: 0,Author_ID,Date_of_tweet,Tweet,Likes_on_tweet,User_handle,Tweet_link
0,0,60483175,2020-08-04 23:31:59+00:00,Historically #TamilNadu has been a state with ...,1,venkattcv,https://twitter.com/venkattcv/status/129079263...
1,1,1178900491595636736,2020-08-04 23:31:56+00:00,#HelloStrangerEP6 #HarrisonFord #GeorgeFloyd #...,0,BethEYoung3,https://twitter.com/BethEYoung3/status/1290792...
2,2,1072481505090318341,2020-08-04 22:22:30+00:00,@cbseindia29 @PMOIndia @narendramodi @DrRPNish...,1,PatelPrerak18,https://twitter.com/PatelPrerak18/status/12907...
3,3,583518901,2020-08-04 22:20:56+00:00,@PMOIndia’s anti-democratic #NEP2020 was 400+ ...,1,kchongpacrim,https://twitter.com/kchongpacrim/status/129077...
4,4,36339871,2020-08-04 21:01:14+00:00,The latest The Nursing Education Daily! http:/...,0,SkyscapeInc,https://twitter.com/SkyscapeInc/status/1290754...


In [None]:
data.shape

(18240, 7)

In [None]:
# Report how many rows (one per tweet) were loaded
total_tweets = len(data)
print('Total tweets in the data are', total_tweets)

Total tweets in the data are 18240


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18240 entries, 0 to 18239
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      18240 non-null  int64 
 1   Author_ID       18240 non-null  int64 
 2   Date_of_tweet   18240 non-null  object
 3   Tweet           18240 non-null  object
 4   Likes_on_tweet  18240 non-null  int64 
 5   User_handle     18240 non-null  object
 6   Tweet_link      18240 non-null  object
dtypes: int64(3), object(4)
memory usage: 997.6+ KB


The dataset contains 18240 tweets, each described by 7 columns:

* Unnamed: 0 (a leftover row-number column from an earlier CSV export; it carries no information about the tweet)
* Author_ID (ID of the twitter user)
* Date_of_tweet (the date and time the tweet was posted)
* Tweet (The text of the tweet)
* Likes_on_tweet (Number of likes the tweet received)
* User_handle (The username of the twitter account) 
* Tweet_link (Link of the tweet)

### Dropping the unnecessary columns

In [None]:
# Drop the "Unnamed: 0" column: in the preview it simply repeats the row number,
# so it adds nothing to the analysis.
# errors='ignore' keeps this cell safe to re-run once the column is already gone
# (a bare `del data[...]` raises KeyError on the second run).
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

**The `Unnamed: 0` column, which only repeats the row number, has been deleted**

In [None]:
# Work on an independent copy (df1) so that `data` keeps every loaded column
df1 = data.copy(deep=True)

In [None]:
# Remove the columns that identify the author or the individual tweet.
# Re-assigning instead of inplace=True, together with errors='ignore', makes the
# cell a no-op when it is re-run instead of failing with KeyError.
df1 = df1.drop(columns=["Author_ID","User_handle","Tweet_link"], errors='ignore')

As we wish to maintain anonymity and analyze the aggregate sentiments, the columns **"Author_ID", "User_handle", "Tweet_link" have been removed**. 

### Checking for duplicate values

In [None]:
# Count rows that exactly repeat an earlier row across all remaining columns
# (this cell only reports the count; nothing is removed here)

df1.duplicated().sum()

3606

**There are 3606 fully duplicated rows in the dataset**

In [None]:
# Count tweets whose text repeats an earlier row; these repeats are treated as retweets

df1['Tweet'].duplicated().sum()

11382

There are 11382 tweets whose text duplicates an earlier tweet; these are treated as retweets

In [None]:
# Keep the first occurrence of every tweet text and discard the later repeats (the retweets)
df1 = df1.drop_duplicates(subset='Tweet', keep='first')

# Re-check: the number of fully duplicated rows left after the drop
df1.duplicated().sum()

0

**Only the original tweets have been retained and all the retweets have been dropped**

### Checking for missing values

In [None]:
# Count the rows that contain at least one missing (null) value
df1.isnull().any(axis=1).sum()

0

**There are no missing values in the dataset**

In [None]:
df1.head()

Unnamed: 0,Date_of_tweet,Tweet,Likes_on_tweet
0,2020-08-04 23:31:59+00:00,Historically #TamilNadu has been a state with ...,1
1,2020-08-04 23:31:56+00:00,#HelloStrangerEP6 #HarrisonFord #GeorgeFloyd #...,0
2,2020-08-04 22:22:30+00:00,@cbseindia29 @PMOIndia @narendramodi @DrRPNish...,1
3,2020-08-04 22:20:56+00:00,@PMOIndia’s anti-democratic #NEP2020 was 400+ ...,1
4,2020-08-04 21:01:14+00:00,The latest The Nursing Education Daily! http:/...,0


In [None]:
df1.shape

(6858, 3)

In [None]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6858 entries, 0 to 18239
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Date_of_tweet   6858 non-null   object
 1   Tweet           6858 non-null   object
 2   Likes_on_tweet  6858 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 214.3+ KB


The cleaned dataset consists of 6858 tweets related to 3 features (Date_of_tweet, Tweet and Likes_on_tweet)

# Data Pre-processing

### Converting Date_of_tweet dtype to datetime

In [None]:
# Parse the timestamp strings into pandas datetimes and store them back in place
tweet_timestamps = pd.to_datetime(df1['Date_of_tweet'])
df1['Date_of_tweet'] = tweet_timestamps

The Date feature has been converted to datetime data type for further analysis

In [None]:
#!pip install nltk     #//installing the nltk library//

In [None]:
import html
import re
import unicodedata

import nltk

In [None]:
# Fetch the NLTK resources this notebook relies on
nltk.download('omw-1.4')     # Open Multilingual WordNet data — presumably needed alongside 'wordnet'; TODO confirm
nltk.download('punkt')       # Punkt tokenizer models
nltk.download('stopwords')   # stop-word lists, including the English list used below
nltk.download('wordnet')     # WordNet corpus used by the WordNet lemmatizer

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

### Removing blank spaces before and after the tweet text

In [None]:
# Trim leading/trailing whitespace with the vectorised string accessor.
# Unlike `.map(str.strip)`, it passes missing values through instead of raising
# TypeError on the first non-string entry.
df1['Tweet'] = df1['Tweet'].str.strip()

### Removing punctuation and digits

In [None]:
def remove_punct(text):
    """Strip punctuation and digits from a tweet.

    Args:
        text: Raw tweet text.

    Returns:
        The text with HTML entities decoded, every punctuation character
        (ASCII and other Unicode punctuation) removed, and all runs of the
        digits 0-9 removed. Whitespace is left untouched.
    """
    # Tweets arrive HTML-escaped ("&amp;"); decode first so that the entity is
    # removed as the punctuation mark it stands for instead of leaving "amp".
    text = html.unescape(text)
    # string.punctuation only covers ASCII; the Unicode "P*" categories also
    # catch typographic marks such as curly quotes/apostrophes and dashes.
    text = "".join(
        char for char in text
        if char not in string.punctuation
        and not unicodedata.category(char).startswith("P")
    )
    return re.sub(r'[0-9]+', '', text)

# Store the punctuation- and digit-free version of every tweet, then preview three rows
df1['Tweet_punct'] = df1['Tweet'].apply(remove_punct)
df1.head(3)

Unnamed: 0,Date_of_tweet,Tweet,Likes_on_tweet,Tweet_punct
0,2020-08-04 23:31:59+00:00,Historically #TamilNadu has been a state with ...,1,Historically TamilNadu has been a state with p...
1,2020-08-04 23:31:56+00:00,#HelloStrangerEP6 #HarrisonFord #GeorgeFloyd #...,0,HelloStrangerEP HarrisonFord GeorgeFloyd JiCha...
2,2020-08-04 22:22:30+00:00,@cbseindia29 @PMOIndia @narendramodi @DrRPNish...,1,cbseindia PMOIndia narendramodi DrRPNishank AB...


In [None]:
# Example of the processed tweet: the first row before and after remove_punct
print("Before removing punctuations --> ", df1['Tweet'].iloc[0])
print("After removing punctuations --> ", df1['Tweet_punct'].iloc[0])

Before removing punctuations -->  Historically #TamilNadu has been a state with progressive #socialpolicies &amp; high state/private investments in #education. Not without pitfalls but one of the best in India. If states were not involved in framing #NEP2020 as alleged here,I'm very surprised.
After removing punctuations -->  Historically TamilNadu has been a state with progressive socialpolicies amp high stateprivate investments in education Not without pitfalls but one of the best in India If states were not involved in framing NEP as alleged hereIm very surprised


### Conversion to lower case and tokenization (splitting a sentence into a list of words)

In [None]:
def tokenization(text):
    """Split text into word tokens on runs of non-word characters.

    Args:
        text: String to split (the caller lower-cases it first).

    Returns:
        List of the non-empty tokens in their original order. An empty or
        separator-only string yields an empty list.
    """
    # Raw string: '\W' inside a normal literal is an invalid escape sequence.
    # re.split leaves '' at either end when the text starts or finishes with a
    # separator (e.g. a space left behind by punctuation removal), so drop the
    # empty pieces rather than passing them on as tokens.
    return [token for token in re.split(r'\W+', text) if token]

# Lower-case each cleaned tweet, split it into word tokens, then preview three rows
df1['Tweet_tokenized'] = df1['Tweet_punct'].str.lower().apply(tokenization)
df1.head(3)

Unnamed: 0,Date_of_tweet,Tweet,Likes_on_tweet,Tweet_punct,Tweet_tokenized
0,2020-08-04 23:31:59+00:00,Historically #TamilNadu has been a state with ...,1,Historically TamilNadu has been a state with p...,"[historically, tamilnadu, has, been, a, state,..."
1,2020-08-04 23:31:56+00:00,#HelloStrangerEP6 #HarrisonFord #GeorgeFloyd #...,0,HelloStrangerEP HarrisonFord GeorgeFloyd JiCha...,"[hellostrangerep, harrisonford, georgefloyd, j..."
2,2020-08-04 22:22:30+00:00,@cbseindia29 @PMOIndia @narendramodi @DrRPNish...,1,cbseindia PMOIndia narendramodi DrRPNishank AB...,"[cbseindia, pmoindia, narendramodi, drrpnishan..."


In [None]:
# Example of the processed tweet: the first row before and after tokenization
print("Before tokenization --> ", df1['Tweet_punct'].iloc[0])
print("After tokenization --> ", df1['Tweet_tokenized'].iloc[0])

Before tokenization -->  Historically TamilNadu has been a state with progressive socialpolicies amp high stateprivate investments in education Not without pitfalls but one of the best in India If states were not involved in framing NEP as alleged hereIm very surprised
After tokenization -->  ['historically', 'tamilnadu', 'has', 'been', 'a', 'state', 'with', 'progressive', 'socialpolicies', 'amp', 'high', 'stateprivate', 'investments', 'in', 'education', 'not', 'without', 'pitfalls', 'but', 'one', 'of', 'the', 'best', 'in', 'india', 'if', 'states', 'were', 'not', 'involved', 'in', 'framing', 'nep', 'as', 'alleged', 'hereim', 'very', 'surprised']


### Remove stopwords

In [None]:
# English stop-word list from the NLTK corpus, used by the filtering helpers below
stopword = stopwords.words('english')

In [None]:
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in the module-level ``stopword`` list.

    ``text`` is an iterable of tokens; order and repeated tokens are preserved.
    """
    kept = []
    for token in text:
        if token in stopword:
            continue
        kept.append(token)
    return kept
    
# Filter the stop words out of every token list, then preview the frame
df1['Tweet_nonstop'] = df1['Tweet_tokenized'].apply(remove_stopwords)
df1.head()

Unnamed: 0,Date_of_tweet,Tweet,Likes_on_tweet,Tweet_punct,Tweet_tokenized,Tweet_nonstop
0,2020-08-04 23:31:59+00:00,Historically #TamilNadu has been a state with ...,1,Historically TamilNadu has been a state with p...,"[historically, tamilnadu, has, been, a, state,...","[historically, tamilnadu, state, progressive, ..."
1,2020-08-04 23:31:56+00:00,#HelloStrangerEP6 #HarrisonFord #GeorgeFloyd #...,0,HelloStrangerEP HarrisonFord GeorgeFloyd JiCha...,"[hellostrangerep, harrisonford, georgefloyd, j...","[hellostrangerep, harrisonford, georgefloyd, j..."
2,2020-08-04 22:22:30+00:00,@cbseindia29 @PMOIndia @narendramodi @DrRPNish...,1,cbseindia PMOIndia narendramodi DrRPNishank AB...,"[cbseindia, pmoindia, narendramodi, drrpnishan...","[cbseindia, pmoindia, narendramodi, drrpnishan..."
3,2020-08-04 22:20:56+00:00,@PMOIndia’s anti-democratic #NEP2020 was 400+ ...,1,PMOIndia’s antidemocratic NEP was pgs in dra...,"[pmoindia, s, antidemocratic, nep, was, pgs, i...","[pmoindia, antidemocratic, nep, pgs, draft, so..."
4,2020-08-04 21:01:14+00:00,The latest The Nursing Education Daily! http:/...,0,The latest The Nursing Education Daily httpned...,"[the, latest, the, nursing, education, daily, ...","[latest, nursing, education, daily, httpnedsky..."


In [None]:
# Example of the processed tweet: the first row before and after stop-word removal
print("Before stopwords removal --> ", df1['Tweet_tokenized'].iloc[0])
print("After stopwords removal --> ", df1['Tweet_nonstop'].iloc[0])

Before stopwords removal -->  ['historically', 'tamilnadu', 'has', 'been', 'a', 'state', 'with', 'progressive', 'socialpolicies', 'amp', 'high', 'stateprivate', 'investments', 'in', 'education', 'not', 'without', 'pitfalls', 'but', 'one', 'of', 'the', 'best', 'in', 'india', 'if', 'states', 'were', 'not', 'involved', 'in', 'framing', 'nep', 'as', 'alleged', 'hereim', 'very', 'surprised']
After stopwords removal -->  ['historically', 'tamilnadu', 'state', 'progressive', 'socialpolicies', 'amp', 'high', 'stateprivate', 'investments', 'education', 'without', 'pitfalls', 'one', 'best', 'india', 'states', 'involved', 'framing', 'nep', 'alleged', 'hereim', 'surprised']



### Stemming - Transforming any form of a word to its root word

Example: progressive → progress

In [None]:
# Porter stemmer instance shared by stemming() and clean_text()
ps = nltk.stem.PorterStemmer()

def stemming(text):
    """Stem every token in ``text`` with the shared Porter stemmer.

    ``text`` is an iterable of tokens; the stems are returned as a list in
    the same order.
    """
    return list(map(ps.stem, text))

# Stem the stop-word-free tokens of every tweet, then preview the frame
df1['Tweet_stemmed'] = df1['Tweet_nonstop'].apply(stemming)
df1.head()

Unnamed: 0,Date_of_tweet,Tweet,Likes_on_tweet,Tweet_punct,Tweet_tokenized,Tweet_nonstop,Tweet_stemmed
0,2020-08-04 23:31:59+00:00,Historically #TamilNadu has been a state with ...,1,Historically TamilNadu has been a state with p...,"[historically, tamilnadu, has, been, a, state,...","[historically, tamilnadu, state, progressive, ...","[histor, tamilnadu, state, progress, socialpol..."
1,2020-08-04 23:31:56+00:00,#HelloStrangerEP6 #HarrisonFord #GeorgeFloyd #...,0,HelloStrangerEP HarrisonFord GeorgeFloyd JiCha...,"[hellostrangerep, harrisonford, georgefloyd, j...","[hellostrangerep, harrisonford, georgefloyd, j...","[hellostrangerep, harrisonford, georgefloyd, j..."
2,2020-08-04 22:22:30+00:00,@cbseindia29 @PMOIndia @narendramodi @DrRPNish...,1,cbseindia PMOIndia narendramodi DrRPNishank AB...,"[cbseindia, pmoindia, narendramodi, drrpnishan...","[cbseindia, pmoindia, narendramodi, drrpnishan...","[cbseindia, pmoindia, narendramodi, drrpnishan..."
3,2020-08-04 22:20:56+00:00,@PMOIndia’s anti-democratic #NEP2020 was 400+ ...,1,PMOIndia’s antidemocratic NEP was pgs in dra...,"[pmoindia, s, antidemocratic, nep, was, pgs, i...","[pmoindia, antidemocratic, nep, pgs, draft, so...","[pmoindia, antidemocrat, nep, pg, draft, someh..."
4,2020-08-04 21:01:14+00:00,The latest The Nursing Education Daily! http:/...,0,The latest The Nursing Education Daily httpned...,"[the, latest, the, nursing, education, daily, ...","[latest, nursing, education, daily, httpnedsky...","[latest, nurs, educ, daili, httpnedskyscapecom..."


In [None]:
# Example of the processed tweet: the first row before and after stemming
print("Before stemming --> ", df1['Tweet_nonstop'].iloc[0])
print("After stemming --> ", df1['Tweet_stemmed'].iloc[0])

Before stemming -->  ['historically', 'tamilnadu', 'state', 'progressive', 'socialpolicies', 'amp', 'high', 'stateprivate', 'investments', 'education', 'without', 'pitfalls', 'one', 'best', 'india', 'states', 'involved', 'framing', 'nep', 'alleged', 'hereim', 'surprised']
After stemming -->  ['histor', 'tamilnadu', 'state', 'progress', 'socialpolici', 'amp', 'high', 'statepriv', 'invest', 'educ', 'without', 'pitfal', 'one', 'best', 'india', 'state', 'involv', 'frame', 'nep', 'alleg', 'hereim', 'surpris']


### Lemmatization - reducing each word to its dictionary form (lemma)

In [None]:
# WordNet lemmatizer instance reused for every token
wn = WordNetLemmatizer()

def lemmatizer(text):
    """Lemmatize every token in ``text`` with WordNet and return the results as a list.

    ``lemmatize`` is called without a part-of-speech tag, so NLTK falls back
    to its default (noun); verb forms such as "involved" come back unchanged.
    """
    return list(map(wn.lemmatize, text))

# Lemmatize the stop-word-free tokens (not the stems) of every tweet, then preview the frame
df1['Tweet_lemmatized'] = df1['Tweet_nonstop'].apply(lemmatizer)
df1.head()

Unnamed: 0,Date_of_tweet,Tweet,Likes_on_tweet,Tweet_punct,Tweet_tokenized,Tweet_nonstop,Tweet_stemmed,Tweet_lemmatized
0,2020-08-04 23:31:59+00:00,Historically #TamilNadu has been a state with ...,1,Historically TamilNadu has been a state with p...,"[historically, tamilnadu, has, been, a, state,...","[historically, tamilnadu, state, progressive, ...","[histor, tamilnadu, state, progress, socialpol...","[historically, tamilnadu, state, progressive, ..."
1,2020-08-04 23:31:56+00:00,#HelloStrangerEP6 #HarrisonFord #GeorgeFloyd #...,0,HelloStrangerEP HarrisonFord GeorgeFloyd JiCha...,"[hellostrangerep, harrisonford, georgefloyd, j...","[hellostrangerep, harrisonford, georgefloyd, j...","[hellostrangerep, harrisonford, georgefloyd, j...","[hellostrangerep, harrisonford, georgefloyd, j..."
2,2020-08-04 22:22:30+00:00,@cbseindia29 @PMOIndia @narendramodi @DrRPNish...,1,cbseindia PMOIndia narendramodi DrRPNishank AB...,"[cbseindia, pmoindia, narendramodi, drrpnishan...","[cbseindia, pmoindia, narendramodi, drrpnishan...","[cbseindia, pmoindia, narendramodi, drrpnishan...","[cbseindia, pmoindia, narendramodi, drrpnishan..."
3,2020-08-04 22:20:56+00:00,@PMOIndia’s anti-democratic #NEP2020 was 400+ ...,1,PMOIndia’s antidemocratic NEP was pgs in dra...,"[pmoindia, s, antidemocratic, nep, was, pgs, i...","[pmoindia, antidemocratic, nep, pgs, draft, so...","[pmoindia, antidemocrat, nep, pg, draft, someh...","[pmoindia, antidemocratic, nep, pgs, draft, so..."
4,2020-08-04 21:01:14+00:00,The latest The Nursing Education Daily! http:/...,0,The latest The Nursing Education Daily httpned...,"[the, latest, the, nursing, education, daily, ...","[latest, nursing, education, daily, httpnedsky...","[latest, nurs, educ, daili, httpnedskyscapecom...","[latest, nursing, education, daily, httpnedsky..."


In [None]:
# Example of the processed tweet.
# The lemmatizer was applied to Tweet_nonstop, so that column is the real
# "before"; printing Tweet_stemmed here would suggest the stems were lemmatized.
print("Before lemmatization --> ", df1['Tweet_nonstop'].iloc[0])
print("After lemmatization --> ", df1['Tweet_lemmatized'].iloc[0])

Before lemmatization -->  ['histor', 'tamilnadu', 'state', 'progress', 'socialpolici', 'amp', 'high', 'statepriv', 'invest', 'educ', 'without', 'pitfal', 'one', 'best', 'india', 'state', 'involv', 'frame', 'nep', 'alleg', 'hereim', 'surpris']
After lemmatization -->  ['historically', 'tamilnadu', 'state', 'progressive', 'socialpolicies', 'amp', 'high', 'stateprivate', 'investment', 'education', 'without', 'pitfall', 'one', 'best', 'india', 'state', 'involved', 'framing', 'nep', 'alleged', 'hereim', 'surprised']


### Vectorisation

<br>

__Cleaning the data in a single step by passing `clean_text` to the CountVectorizer as its analyzer__

In [None]:
def clean_text(text):
    """Turn one raw tweet into the list of stemmed tokens used by CountVectorizer.

    Runs the same steps that build the ``Tweet_stemmed`` column, reusing the
    helpers defined earlier in the notebook so the two cannot drift apart:
    ``remove_punct`` -> lower-casing -> ``tokenization`` ->
    ``remove_stopwords`` -> ``stemming``.

    Args:
        text: Raw tweet text.

    Returns:
        List of stemmed, lower-cased tokens with punctuation, digits and
        stop words removed. Never contains empty strings.
    """
    tokens = tokenization(remove_punct(text).lower())
    # Splitting can leave '' at the edges of the text; an empty token is not a
    # stop word, so without this filter it would become a vocabulary entry.
    tokens = [token for token in tokens if token]
    return stemming(remove_stopwords(tokens))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Build the document-term count matrix, using clean_text as the analyzer for each raw tweet
countVectorizer = CountVectorizer(analyzer=clean_text)
countVector = countVectorizer.fit_transform(df1['Tweet'])

# Rows are tweets, columns are the distinct tokens produced by clean_text
n_tweets, n_terms = countVector.shape
print('{} Number of tweets has {} words'.format(n_tweets, n_terms))

6858 Number of tweets has 13315 words


__Saving the cleaned data into a csv file__

In [None]:
# Persist the processed frame (the index is written as the first column).
df1.to_csv("cleaned1.csv")

# Offer the file as a browser download only when running inside Google Colab.
# Anywhere else the google.colab package is missing, and an unconditional import
# would stop "Run All" at this cell even though the CSV is already on disk.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('cleaned1.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Importing the cleaned dataset

In [None]:
# Reload the saved frame; the first CSV column is the index written by to_csv.
# parse_dates restores Date_of_tweet as datetimes; a plain read_csv returns the
# timestamps as strings and undoes the earlier pd.to_datetime conversion.
# NOTE: the list-valued token columns come back as their string repr, not as lists.
new_data = pd.read_csv("cleaned1.csv", index_col=[0], parse_dates=["Date_of_tweet"])
new_data.head(3)

Unnamed: 0,Date_of_tweet,Tweet,Likes_on_tweet,Tweet_punct,Tweet_tokenized,Tweet_nonstop,Tweet_stemmed,Tweet_lemmatized
0,2020-08-04 23:31:59+00:00,Historically #TamilNadu has been a state with ...,1,Historically TamilNadu has been a state with p...,"['historically', 'tamilnadu', 'has', 'been', '...","['historically', 'tamilnadu', 'state', 'progre...","['histor', 'tamilnadu', 'state', 'progress', '...","['historically', 'tamilnadu', 'state', 'progre..."
1,2020-08-04 23:31:56+00:00,#HelloStrangerEP6 #HarrisonFord #GeorgeFloyd #...,0,HelloStrangerEP HarrisonFord GeorgeFloyd JiCha...,"['hellostrangerep', 'harrisonford', 'georgeflo...","['hellostrangerep', 'harrisonford', 'georgeflo...","['hellostrangerep', 'harrisonford', 'georgeflo...","['hellostrangerep', 'harrisonford', 'georgeflo..."
2,2020-08-04 22:22:30+00:00,@cbseindia29 @PMOIndia @narendramodi @DrRPNish...,1,cbseindia PMOIndia narendramodi DrRPNishank AB...,"['cbseindia', 'pmoindia', 'narendramodi', 'drr...","['cbseindia', 'pmoindia', 'narendramodi', 'drr...","['cbseindia', 'pmoindia', 'narendramodi', 'drr...","['cbseindia', 'pmoindia', 'narendramodi', 'drr..."


### Calculating the polarity and subjectivity on the cleaned data.

In [None]:
import ast

from textblob import TextBlob


def _lemmas_to_text(cell):
    """Return one row's lemmatized tokens as a space-separated string.

    After the CSV round trip the column holds the string repr of each token
    list ("['historically', 'tamilnadu', ...]"); parse it back into a list
    before joining. Values that are already lists are joined directly.
    """
    tokens = ast.literal_eval(cell) if isinstance(cell, str) else cell
    return " ".join(tokens)


# Analyse each tweet once and read both scores from the same result, so that
# TextBlob scores the words themselves rather than brackets, quotes and commas.
# NOTE(review): the lemmatized tokens have stop words such as "not" removed,
# which may affect polarity — confirm this is intended.
sentiments = [
    TextBlob(_lemmas_to_text(cell)).sentiment
    for cell in new_data["Tweet_lemmatized"]
]

# Adds polarity and subjectivity column
new_data["Polarity"] = [sentiment.polarity for sentiment in sentiments]
new_data["Subjectivity"] = [sentiment.subjectivity for sentiment in sentiments]

In [None]:
new_data.head()

Unnamed: 0,Date_of_tweet,Tweet,Likes_on_tweet,Tweet_punct,Tweet_tokenized,Tweet_nonstop,Tweet_stemmed,Tweet_lemmatized,Polarity,Subjectivity
0,2020-08-04 23:31:59+00:00,Historically #TamilNadu has been a state with ...,1,Historically TamilNadu has been a state with p...,"['historically', 'tamilnadu', 'has', 'been', '...","['historically', 'tamilnadu', 'state', 'progre...","['histor', 'tamilnadu', 'state', 'progress', '...","['historically', 'tamilnadu', 'state', 'progre...",0.232,0.368
1,2020-08-04 23:31:56+00:00,#HelloStrangerEP6 #HarrisonFord #GeorgeFloyd #...,0,HelloStrangerEP HarrisonFord GeorgeFloyd JiCha...,"['hellostrangerep', 'harrisonford', 'georgeflo...","['hellostrangerep', 'harrisonford', 'georgeflo...","['hellostrangerep', 'harrisonford', 'georgeflo...","['hellostrangerep', 'harrisonford', 'georgeflo...",0.0,0.0
2,2020-08-04 22:22:30+00:00,@cbseindia29 @PMOIndia @narendramodi @DrRPNish...,1,cbseindia PMOIndia narendramodi DrRPNishank AB...,"['cbseindia', 'pmoindia', 'narendramodi', 'drr...","['cbseindia', 'pmoindia', 'narendramodi', 'drr...","['cbseindia', 'pmoindia', 'narendramodi', 'drr...","['cbseindia', 'pmoindia', 'narendramodi', 'drr...",-0.015909,0.394886
3,2020-08-04 22:20:56+00:00,@PMOIndia’s anti-democratic #NEP2020 was 400+ ...,1,PMOIndia’s antidemocratic NEP was pgs in dra...,"['pmoindia', 's', 'antidemocratic', 'nep', 'wa...","['pmoindia', 'antidemocratic', 'nep', 'pgs', '...","['pmoindia', 'antidemocrat', 'nep', 'pg', 'dra...","['pmoindia', 'antidemocratic', 'nep', 'pgs', '...",0.375,0.75
4,2020-08-04 21:01:14+00:00,The latest The Nursing Education Daily! http:/...,0,The latest The Nursing Education Daily httpned...,"['the', 'latest', 'the', 'nursing', 'education...","['latest', 'nursing', 'education', 'daily', 'h...","['latest', 'nurs', 'educ', 'daili', 'httpnedsk...","['latest', 'nursing', 'education', 'daily', 'h...",0.233333,0.366667


In [None]:
# Display only the Polarity and Subjectivity columns along with tweet

new_data[["Polarity","Subjectivity","Tweet"]].head(20)

Unnamed: 0,Polarity,Subjectivity,Tweet
0,0.232,0.368,Historically #TamilNadu has been a state with ...
1,0.0,0.0,#HelloStrangerEP6 #HarrisonFord #GeorgeFloyd #...
2,-0.015909,0.394886,@cbseindia29 @PMOIndia @narendramodi @DrRPNish...
3,0.375,0.75,@PMOIndia’s anti-democratic #NEP2020 was 400+ ...
4,0.233333,0.366667,The latest The Nursing Education Daily! http:/...
5,0.1,0.6,With Rafale India has strengthened it's defenc...
6,0.0,0.0,as soon as this pandemic will end I'm going ba...
7,0.35,0.558333,Finally those adverts of Amity University and ...
8,0.0,0.0,"Bollywood is not HINDI Film industry, It's urd..."
9,0.066667,0.133333,NEP 2020 can propel India's emergence as an ed...


In [None]:
# Overwrite cleaned1.csv so the saved file also carries the Polarity and Subjectivity columns
new_data.to_csv("cleaned1.csv")

In [None]:
new_data.describe()

Unnamed: 0,Likes_on_tweet,Polarity,Subjectivity
count,6858.0,6858.0,6858.0
mean,13.982794,0.120068,0.358426
std,85.128785,0.225005,0.271701
min,0.0,-1.0,0.0
25%,0.0,0.0,0.1
50%,1.0,0.070346,0.391667
75%,3.0,0.234659,0.5125
max,2694.0,1.0,1.0
