In [1]:
import ast
import random
import re

import folium
import ipywidgets as widgets
import nltk
import pandas as pd
from folium.plugins import HeatMap
from IPython.display import IFrame, display
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [8]:
# Fetch the NLTK resources the preprocessing step relies on:
#   punkt / punkt_tab -> tokenizer models used by word_tokenize
#                        (recent NLTK releases look up "punkt_tab" rather than "punkt")
#   stopwords         -> English stop-word list
#   wordnet           -> dictionary used by WordNetLemmatizer
# nltk.download() reports packages that are already up to date and moves on,
# so re-running this cell is cheap.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\creat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\creat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\creat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the airline-tweets CSV and show a plain-text preview of its first rows.
# NOTE(review): this is an absolute path on one machine; anyone else running
# the notebook has to point `file_path` at their own copy of Tweets.csv.
file_path = r"C:\Users\creat\Downloads\Tweets.csv"
tweets_df = pd.read_csv(filepath_or_buffer=file_path)
preview_rows = tweets_df.head()
print(preview_rows)

             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence         airline  \
0            NaN                        NaN  Virgin America   
1            NaN                     0.0000  Virgin America   
2            NaN                        NaN  Virgin America   
3     Bad Flight                     0.7033  Virgin America   
4     Can't Tell                     1.0000  Virgin America   

  airline_sentiment_gold        name negativereason_gold  retweet_count  \
0                    NaN     cairdin                 NaN              0   
1                    NaN    jnar

In [3]:
# One shared WordNet lemmatizer instance; preprocess_ttweet calls
# lemmatizer.lemmatize() on every token that survives stop-word filtering.
lemmatizer = WordNetLemmatizer()

In [4]:
# Compiled once when the cell runs rather than on every call.
# URLs, @mentions and #hashtags are dropped together with their text.
_TWEET_NOISE_RE = re.compile(r"http\S+|www\S+|@\S+|#\S+")
# Anything that is neither a word character nor whitespace (punctuation,
# plus any stray '@' / '#' left standing on its own).
_TWEET_PUNCT_RE = re.compile(r"[^\w\s]")
# Lazily filled cache so the stop-word corpus is loaded a single time.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the English stop words as a frozenset, loading the corpus once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_ttweet(tweet):
    """Normalise one tweet into a space-joined string of lemmatized tokens.

    Steps, in order:
      1. strip URLs, @mentions and #hashtags (including the tag text),
      2. strip remaining punctuation,
      3. lower-case,
      4. tokenize with ``word_tokenize``,
      5. drop English stop words,
      6. lemmatize each remaining token with the shared ``lemmatizer``.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for example the float NaN that
        pandas uses for a missing cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing is left.
    """
    if not isinstance(tweet, str):
        # Missing values would otherwise make re.sub raise a TypeError.
        return ''

    cleaned = _TWEET_NOISE_RE.sub('', tweet)
    cleaned = _TWEET_PUNCT_RE.sub('', cleaned)
    cleaned = cleaned.lower()

    # Set membership instead of re-reading the stop-word list for every token.
    stop_words = _english_stop_words()
    kept_tokens = [token for token in word_tokenize(cleaned) if token not in stop_words]
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)

In [5]:
# Clean every raw tweet and keep the result next to the original text column.
cleaned_text = tweets_df['text'].apply(preprocess_ttweet)
tweets_df['processed_text'] = cleaned_text


In [6]:
# Identity lookup: each sentiment label is reused unchanged as the emotion label.
# Series.map yields NaN for any label that is not a key of the mapping.
emotion_mapping = {label: label for label in ('positive', 'neutral', 'negative')}
sentiment_labels = tweets_df['airline_sentiment']
tweets_df['emotion'] = sentiment_labels.map(emotion_mapping)


In [7]:
# Features are the cleaned tweet text; targets are the emotion labels.
X, y = tweets_df['processed_text'], tweets_df['emotion']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [8]:
# TF-IDF features capped at 5000 terms. The vectorizer is fitted on the
# training split only; the test split is transformed with that fitted state.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),  # fit + transform the training text
    vectorizer.transform(X_test),       # transform only, no refitting
)

In [9]:
# Logistic-regression classifier with library defaults, trained on the
# TF-IDF matrix of the training split and its emotion labels.
model = LogisticRegression()
model.fit(X=X_train_tfidf, y=y_train)



In [11]:
import joblib

# The classifier only accepts vectors produced by the fitted `vectorizer`,
# so a model file on its own cannot score raw text. Persist both artifacts.
# The model dump stays last so the cell still displays ['sentiment_model.pkl'].
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(model, 'sentiment_model.pkl')


['sentiment_model.pkl']

In [12]:
# Predict labels for the held-out tweets and print the per-class
# precision / recall / F1 summary.
y_pred = model.predict(X_test_tfidf)
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

    negative       0.82      0.94      0.88      1889
     neutral       0.66      0.48      0.56       580
    positive       0.81      0.61      0.70       459

    accuracy                           0.80      2928
   macro avg       0.77      0.68      0.71      2928
weighted avg       0.79      0.80      0.78      2928



In [19]:
def random_coordinates(num):
    """Return `num` random [latitude, longitude] pairs.

    Latitude is drawn uniformly from [-90, 90] and longitude from [-180, 180].
    These are placeholders for plotting, not real tweet locations.
    """
    return [[random.uniform(-90, 90), random.uniform(-180, 180)] for _ in range(num)]


def _parse_tweet_coord(value):
    """Convert one raw `tweet_coord` cell into a coordinate pair.

    * list / tuple -> returned unchanged, so re-running this cell on an
      already-parsed column does not fail.
    * str -> parsed with ast.literal_eval, which accepts Python literals only.
      The values come from a CSV file, so eval() would execute whatever
      expression the file happened to contain.
    * anything else (NaN / None) -> the [None, None] placeholder that the
      heat-map step filters out.

    Raises ValueError or SyntaxError when a string is not a valid literal.
    """
    if isinstance(value, (list, tuple)):
        return value
    if isinstance(value, str):
        return ast.literal_eval(value)
    return [None, None]


if 'tweet_coord' not in tweets_df.columns or tweets_df['tweet_coord'].isna().all():
    # No usable coordinates in the data: fall back to random placeholder points.
    coordinates = random_coordinates(len(tweets_df))
    tweets_df['tweet_coord'] = coordinates
else:
    tweets_df['tweet_coord'] = tweets_df['tweet_coord'].apply(_parse_tweet_coord)


In [20]:
# Keep only rows whose tweet_coord is not NaN.
# NOTE(review): [None, None] placeholder lists are not NaN, so those rows
# survive this filter; they are skipped later when heat_data is built.
rows_with_coord = tweets_df.dropna(subset=['tweet_coord'])
tweets_df = rows_with_coord


In [21]:
# Build [lat, lon, weight] triples for the heat map. Every tweet gets weight 1;
# pairs with a missing latitude or longitude are left out.
heat_data = [
    [coord[0], coord[1], 1]
    for coord in tweets_df['tweet_coord']
    if not (coord[0] is None or coord[1] is None)
]


In [22]:
# World-level base map centred at latitude 20, longitude 0.
m = folium.Map(location=[20, 0], zoom_start=2)

# Add the tweet heat layer, then write the map to a standalone HTML file.
heat_layer = HeatMap(heat_data)
heat_layer.add_to(m)

heatmap_path = 'heatmap.html'
m.save(heatmap_path)


In [23]:
# Embed the saved heat-map page inline in a 700x500 frame.
heatmap_frame = IFrame(src=heatmap_path, width=700, height=500)
display(heatmap_frame)


In [24]:
def analyze_sentiment(text):
    """Predict the sentiment label for a single piece of raw text.

    The text is cleaned with ``preprocess_ttweet``, vectorized with the
    fitted ``vectorizer`` and classified by ``model``.

    Returns the first (and only) element of the model's prediction array.
    """
    features = vectorizer.transform([preprocess_ttweet(text)])
    return model.predict(features)[0]

In [31]:
# Free-text entry box for the tweet to classify (starts empty).
input_text = widgets.Text(
    description='Tweet:',
    placeholder='Enter your tweet here',
    value='',
    disabled=False,
)

# Dedicated output area; it is cleared before each new prediction is printed.
output = widgets.Output()


def on_button_click(b):
    """Classify the text currently in the input box and print the label.

    `b` is the argument supplied by the button's click callback; it is unused.
    """
    with output:
        output.clear_output()
        label = analyze_sentiment(input_text.value)
        print("Predicted sentiment:", label)


button = widgets.Button(description="Analyze Sentiment")
button.on_click(on_button_click)

display(input_text, button, output)


Text(value='', description='Tweet:', placeholder='Enter your tweet here')

Button(description='Analyze Sentiment', style=ButtonStyle())

Output()

In [13]:
pip install streamlit-folium

Collecting streamlit-folium
  Downloading streamlit_folium-0.20.0-py3-none-any.whl.metadata (416 bytes)
Downloading streamlit_folium-0.20.0-py3-none-any.whl (326 kB)
   ---------------------------------------- 0.0/326.7 kB ? eta -:--:--
   - -------------------------------------- 10.2/326.7 kB ? eta -:--:--
   - -------------------------------------- 10.2/326.7 kB ? eta -:--:--
   - -------------------------------------- 10.2/326.7 kB ? eta -:--:--
   --- ----------------------------------- 30.7/326.7 kB 163.8 kB/s eta 0:00:02
   --- ----------------------------------- 30.7/326.7 kB 163.8 kB/s eta 0:00:02
   --- ----------------------------------- 30.7/326.7 kB 163.8 kB/s eta 0:00:02
   --- ----------------------------------- 30.7/326.7 kB 163.8 kB/s eta 0:00:02
   --- ----------------------------------- 30.7/326.7 kB 163.8 kB/s eta 0:00:02
   ----- ---------------------------------- 41.0/326.7 kB 85.6 kB/s eta 0:00:04
   ----- ---------------------------------- 41.0/326.7 kB 85.6 kB/s