## sentiment analysis twitter dataset

In [1]:
import nltk.sentiment
import nltk.sentiment.vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
import pandas as pd


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
nltk.download('stopwords')
import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Read File

In [3]:
tweet=pd.read_csv(r"C:\Users\ASUS\Downloads\Practice_Twittersent.csv")

The code snippets provide functions for processing tweets and building frequency dictionaries based on sentiment labels.

In [4]:

def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean


def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweet):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs




In [5]:
tweet['Heading'] = tweet['Heading'].apply(lambda x: process_tweet(x))
##tweet['Heading_pree'] = tweet['Heading'].apply(lambda x: process_tweet(' '.join(x)))


In [6]:
print(tweet)

                                                 Heading Sentiment
0      [upset, can't, updat, facebook, text, ..., mig...  Negative
1      [dive, mani, time, ball, manag, save, 50, rest...  Positive
2                 [whole, bodi, feel, itchi, like, fire]  Negative
3                          [behav, i'm, mad, can't, see]  Negative
4                                          [whole, crew]   Neutral
...                                                  ...       ...
13995                 [glad, safe, worri, prairi, chick]  Negative
13996    [grrr, rememb, best, go, earli, belfast, kelli]  Positive
13997  [bed, time, stuart', wembley, tomorrow, wish, ...  Positive
13998  [think, mum, help, suggest, jude, law, semest,...  Negative
13999                                 [can't, ..., bore]  Positive

[14000 rows x 2 columns]


The code snippet you've provided is defining a function called get_value that converts sentiment labels into numerical values and then applies this function to a pandas DataFrame column.

In [7]:
def get_value (x):
    if x == 'Positive':
        return 1
    else:
        return 0
tweet['type'] = tweet['Sentiment'].apply(get_value)

In [8]:
print(tweet)

                                                 Heading Sentiment  type
0      [upset, can't, updat, facebook, text, ..., mig...  Negative     0
1      [dive, mani, time, ball, manag, save, 50, rest...  Positive     1
2                 [whole, bodi, feel, itchi, like, fire]  Negative     0
3                          [behav, i'm, mad, can't, see]  Negative     0
4                                          [whole, crew]   Neutral     0
...                                                  ...       ...   ...
13995                 [glad, safe, worri, prairi, chick]  Negative     0
13996    [grrr, rememb, best, go, earli, belfast, kelli]  Positive     1
13997  [bed, time, stuart', wembley, tomorrow, wish, ...  Positive     1
13998  [think, mum, help, suggest, jude, law, semest,...  Negative     0
13999                                 [can't, ..., bore]  Positive     1

[14000 rows x 3 columns]


In [9]:
tweet.type.value_counts()

type
0    9665
1    4335
Name: count, dtype: int64

In [10]:
len(tweet.type)

14000

In [11]:
tweet.head(5)

Unnamed: 0,Heading,Sentiment,type
0,"[upset, can't, updat, facebook, text, ..., mig...",Negative,0
1,"[dive, mani, time, ball, manag, save, 50, rest...",Positive,1
2,"[whole, bodi, feel, itchi, like, fire]",Negative,0
3,"[behav, i'm, mad, can't, see]",Negative,0
4,"[whole, crew]",Neutral,0


The code snippet you've provided uses scikit-learn's train_test_split function to split a dataset into training and testing sets

In [12]:
from sklearn.model_selection import train_test_split
X=tweet.Heading
y=tweet.type
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)


In [13]:

len(X_test)

2800

The provided code snippet demonstrates how to use CountVectorizer from scikit-learn to convert a list of strings into a matrix of token counts.

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
X_train_strings = [' '.join(tokens) for tokens in X_train]
v = CountVectorizer()

X_train_cv = v.fit_transform(X_train_strings)
print(X_train_cv)

  (0, 8185)	1
  (0, 1819)	1
  (0, 7932)	1
  (0, 10818)	1
  (1, 4838)	1
  (1, 1074)	1
  (1, 4397)	1
  (1, 607)	1
  (1, 7335)	1
  (2, 4067)	1
  (2, 1247)	1
  (2, 6374)	1
  (2, 1226)	1
  (2, 5541)	1
  (3, 1737)	1
  (3, 9154)	1
  (4, 3261)	1
  (4, 8260)	1
  (4, 4847)	1
  (5, 4067)	1
  (5, 4665)	1
  (5, 6780)	1
  (5, 10160)	1
  (5, 5950)	1
  (5, 3481)	1
  :	:
  (11196, 2207)	1
  (11196, 8201)	1
  (11196, 9130)	1
  (11196, 4348)	1
  (11196, 7909)	1
  (11196, 1131)	1
  (11196, 7091)	1
  (11196, 10299)	1
  (11197, 9679)	1
  (11197, 7086)	1
  (11197, 10047)	1
  (11198, 2574)	1
  (11198, 997)	1
  (11198, 4409)	1
  (11198, 10011)	1
  (11198, 524)	1
  (11198, 9973)	1
  (11198, 5027)	1
  (11198, 3657)	1
  (11198, 9307)	1
  (11198, 4996)	1
  (11199, 10792)	1
  (11199, 964)	1
  (11199, 6603)	1
  (11199, 8091)	1


In [15]:
X_train_cv[:2599]

<2599x10898 sparse matrix of type '<class 'numpy.int64'>'
	with 18205 stored elements in Compressed Sparse Row format>

In [16]:
print(X_train_strings)

["i'm sad ... caus repli yet ...", '.. im bare hang almost playyy', 'get bike much better leg', 'canada suck', 'enough say imiss u', 'hope obvious usag massiv factor current get 12 hour n95', "ye yall g'day well smile", 'dream tri tell friend found child w someon care', 'back rofl andyz dick still hurt rofl poor guy', "sittin w kayla realli realli reallli dis-lik brother know stop i'm wait day one", "anyon want buy dear unemploy friend darryl austin citi limit 3 day ticket guess i'll sit year", 'watch mow lawn feel useless unabl help', 'sad sad day im sell one puppi today', 'u tweet verizon fio problem crap sign', 'know eat dinner pizza ;-)', 'impress rain want go market shop bleh', "competit i'm gonna look like crap", 'feel like head gunna explod', 'make happi wish afraid fli', "weekend chore done get readi snowbird' bachelorett parti attempt get work work done", '<3', 'thank invit bowl today', 'quarter percent babe darn', 'sm tweet still fail twitter act support request either', "mor

CountVectorizer

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# Assuming X_train is a Pandas DataFrame or Series with text data
# Convert X_train to a list of strings
X_train_list = X_train.astype(str).tolist()

# Initialize CountVectorizer
v = CountVectorizer()

# Fit and transform X_train_list
X_train_cv = v.fit_transform(X_train_list)

# Print the transformed data
print(X_train_cv)


  (0, 8185)	1
  (0, 1819)	1
  (0, 7932)	1
  (0, 10818)	1
  (1, 4838)	1
  (1, 1074)	1
  (1, 4397)	1
  (1, 607)	1
  (1, 7335)	1
  (2, 4067)	1
  (2, 1247)	1
  (2, 6374)	1
  (2, 1226)	1
  (2, 5541)	1
  (3, 1737)	1
  (3, 9154)	1
  (4, 3261)	1
  (4, 8260)	1
  (4, 4847)	1
  (5, 4067)	1
  (5, 4665)	1
  (5, 6780)	1
  (5, 10160)	1
  (5, 5950)	1
  (5, 3481)	1
  :	:
  (11196, 2207)	1
  (11196, 8201)	1
  (11196, 9130)	1
  (11196, 4348)	1
  (11196, 7909)	1
  (11196, 1131)	1
  (11196, 7091)	1
  (11196, 10299)	1
  (11197, 9679)	1
  (11197, 7086)	1
  (11197, 10047)	1
  (11198, 2574)	1
  (11198, 997)	1
  (11198, 4409)	1
  (11198, 10011)	1
  (11198, 524)	1
  (11198, 9973)	1
  (11198, 5027)	1
  (11198, 3657)	1
  (11198, 9307)	1
  (11198, 4996)	1
  (11199, 10792)	1
  (11199, 964)	1
  (11199, 6603)	1
  (11199, 8091)	1


Bellow steps demonstrates the use of a Machine Learning model to classify textual data based on the token counts generated by CountVectorizer.

MultinomialNB

In [18]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [19]:
# Assuming X_test is a list of lists, where each inner list represents tokens
# Convert each inner list of tokens into a single string
X_test_strings = [' '.join(tokens) for tokens in X_test]

# Transform X_test_strings using the CountVectorizer
X_test_cv = v.transform(X_test_strings)


In [20]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.92      0.86      1915
           1       0.76      0.54      0.63       885

    accuracy                           0.80      2800
   macro avg       0.79      0.73      0.75      2800
weighted avg       0.80      0.80      0.79      2800



DecisionTreeClassifier

In [21]:
from sklearn import tree
model = tree.DecisionTreeClassifier()
model.fit(X_train_cv, y_train)

In [22]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86      1915
           1       0.69      0.69      0.69       885

    accuracy                           0.80      2800
   macro avg       0.77      0.77      0.77      2800
weighted avg       0.80      0.80      0.80      2800



RandomForestClassifier

In [23]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=10)
model.fit(X_train_cv, y_train)

In [24]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.90      0.87      1915
           1       0.74      0.61      0.67       885

    accuracy                           0.81      2800
   macro avg       0.79      0.75      0.77      2800
weighted avg       0.80      0.81      0.80      2800



SVC

In [25]:
from sklearn.svm import SVC
model = SVC()
model.fit(X_train_cv, y_train)

In [26]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.94      0.88      1915
           1       0.81      0.59      0.69       885

    accuracy                           0.83      2800
   macro avg       0.82      0.76      0.78      2800
weighted avg       0.83      0.83      0.82      2800



The provided code demonstrates how to use a trained Multinomial Naive Bayes model to predict the sentiment

In [27]:
x = [
    'vodka'
]

x_count = v.transform(x)
result = model.predict(x_count)

if result == 1:
    print("Positive tweet")
else:
    print("Negative tweet")


Negative tweet
