# Web and Social Media Analytics Assignment

### Import the data and the packages

In [1]:
import numpy as np 
import pandas as pd 
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

# nlp lib
import sys
import nltk 
nltk.download('omw-1.4')
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk import word_tokenize   
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import string, collections, unicodedata
!pip install gensim==3.6.0
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec # library for text analysis
from gensim.summarization import summarize
from gensim.summarization import keywords

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/riccardopandolfi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Collecting gensim==3.6.0
  Using cached gensim-3.6.0-cp39-cp39-macosx_10_9_x86_64.whl
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 4.3.1
    Uninstalling gensim-4.3.1:
      Successfully uninstalled gensim-4.3.1
Successfully installed gensim-3.6.0


In [2]:
# Load the two hashtag datasets and tag every row with the candidate its
# hashtag refers to, so the frames can still be told apart once combined.
donald_trump = pd.read_csv(
    'hashtag_donaldtrump.csv', lineterminator='\n'
).assign(candidate='Donald Trump')
joe_biden = pd.read_csv(
    'hashtag_joebiden.csv', lineterminator='\n'
).assign(candidate='Joe Biden')

###  Data Cleaning

In [3]:
# Delete the rows with missing values
# dropna() is called without `subset`, so a row is discarded as soon as ANY of
# its columns is missing (pandas default how='any'), not only the tweet text.
donald_trump=donald_trump.dropna()
joe_biden=joe_biden.dropna()

In [4]:
# Change the country name to have consistent data.
# The result is assigned back rather than calling
# `frame['country'].replace(..., inplace=True)`: that form mutates an
# intermediate Series, raises a chained-assignment FutureWarning on recent
# pandas and has no effect at all once Copy-on-Write is enabled.
d = {"United States of America": "United States"}
donald_trump['country'] = donald_trump['country'].replace(d)
joe_biden['country'] = joe_biden['country'].replace(d)

# Only consider tweets from the US.
# .copy() makes each result an independent frame, so the columns added to it
# in later cells do not trigger SettingWithCopyWarning.
donald_trump = donald_trump.loc[donald_trump['country'] == "United States"].copy()
joe_biden = joe_biden.loc[joe_biden['country'] == "United States"].copy()

In [5]:
# Create functions to remove stop words and lemmatize
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean(text):
    """Normalise a raw text into a space-separated string of content words.

    Steps, in order: lowercase; drop ``[...]`` spans; strip ASCII punctuation;
    drop tokens containing a digit; drop curly quotes and the ellipsis
    character; turn newlines into spaces; drop every run starting with
    ``http``; tokenise and discard English stop words and one-character tokens.

    Args:
        text: Any value. It is converted with ``str()`` first, so ``None`` and
            NaN become the words "none" and "nan".

    Returns:
        The kept tokens joined by single spaces, with no leading space; an
        empty string when no token survives.
    """
    # Lowercase
    text = str(text).lower()
    # Remove special text in brackets ([chorus],[guitar],etc)
    text = re.sub(r'\[.*?\]', '', text)
    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)
    # Remove quotes
    text = re.sub('[‘’“”…]', '', text)
    # Remove new line \n
    text = re.sub('\n', ' ', text)
    # Remove URLs. Punctuation is already gone at this point, so a URL is a
    # single run such as "httpstcoabc" and is still caught by the http prefix.
    text = re.sub(r"http\S+", "", text)
    # Remove stop words; set membership replaces a list scan per token, and the
    # corpus is no longer re-read on every call.
    stop_words = _english_stop_words()
    kept = [w for w in word_tokenize(text) if w not in stop_words and len(w) > 1]
    return " ".join(kept)

from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

from nltk import word_tokenize, pos_tag

# Fetch every NLTK resource the text helpers in this cell depend on, so they
# also work on a fresh environment: word_tokenize needs 'punkt',
# stopwords.words needs 'stopwords', pos_tag needs the perceptron tagger and
# WordNetLemmatizer needs 'wordnet'.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.
for resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

def lemmatize_tag(text):
    """Lemmatize every token of ``text`` using its part-of-speech tag.

    The first letter of each tag returned by ``pos_tag`` selects the WordNet
    category: J -> adjective ('a'), N -> noun, V -> verb. Tokens with any other
    tag are lemmatized with the lemmatizer's default category. Lemmas of a
    single character are dropped.

    Args:
        text: String to tokenise and lemmatize.

    Returns:
        The kept lemmas joined by single spaces.
    """
    wordnet_pos = {'j': 'a', 'n': 'n', 'v': 'v'}
    kept = []
    for token, tag in pos_tag(word_tokenize(text)):
        pos = wordnet_pos.get(tag[0].lower())
        if pos is None:
            lemma = wnl.lemmatize(token)
        else:
            lemma = wnl.lemmatize(token, pos)
        if len(lemma) > 1:
            kept.append(lemma)
    return ' '.join(kept)

def remove_emoji(text):
    """Strip emoji and related symbol code points from ``text``.

    Every run of characters falling in one of the ranges below, or equal to one
    of the listed single code points, is deleted. Two of the ranges are very
    wide (U+24C2-U+1F251 and U+10000-U+10FFFF), so more than emoji is removed:
    for example CJK ideographs fall inside the first one.

    Args:
        text: String to filter.

    Returns:
        ``text`` without the matched characters; nothing is inserted in their
        place, so surrounding whitespace is left as it was.
    """
    code_point_ranges = (
        ("\U0001F600", "\U0001F64F"),  # emoticons
        ("\U0001F300", "\U0001F5FF"),  # symbols & pictographs
        ("\U0001F680", "\U0001F6FF"),  # transport & map symbols
        ("\U0001F1E0", "\U0001F1FF"),  # flags (iOS)
        ("\U00002700", "\U000027BF"),  # dingbats
        ("\U00002702", "\U000027B0"),
        ("\U000024C2", "\U0001F251"),
        ("\U00002500", "\U00002BEF"),
        ("\U0001f926", "\U0001f937"),
        ("\U00010000", "\U0010ffff"),
        ("\u2640", "\u2642"),
        ("\u2600", "\u2B55"),
    )
    single_code_points = ("\u200d", "\u23cf", "\u23e9", "\u231a", "\ufe0f", "\u3030")

    char_class = "".join(low + "-" + high for low, high in code_point_ranges)
    char_class += "".join(single_code_points)
    matcher = re.compile("[" + char_class + "]+", flags=re.UNICODE)
    return matcher.sub("", text)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/riccardopandolfi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/riccardopandolfi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Build the cleaned_tweet column for both candidates: normalise the raw text,
# lemmatize the result, then strip the characters remove_emoji covers.
for frame in (donald_trump, joe_biden):
    frame['cleaned_tweet'] = (
        frame['tweet']
        .apply(clean)
        .apply(lemmatize_tag)
        .apply(remove_emoji)
    )

### Sentiment Analysis

In [None]:
# Add polarity neg/positive labels
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# The analyzer needs the VADER lexicon, which no other cell downloads; fetch it
# here so the cell also runs on a fresh environment.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# polarity_scores returns one dict per tweet; it is stored as-is in "scores"
# and unpacked by the next cell.
donald_trump["scores"] = donald_trump["cleaned_tweet"].apply(sid.polarity_scores)
joe_biden["scores"] = joe_biden["cleaned_tweet"].apply(sid.polarity_scores)


In [None]:
# Derive the compound score and a binary label from the VADER score dicts.
def _label_from_compound(c):
    """Return "pos" when the compound score is at least 0.3, otherwise "neg"."""
    if c >= 0.3:
        return "pos"
    return "neg"


for frame in (donald_trump, joe_biden):
    frame["compound"] = frame["scores"].apply(lambda score_dict: score_dict["compound"])
    frame["comp_score"] = frame["compound"].apply(_label_from_compound)

In [None]:
# Concatenate the two datasets.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels inherited from the two source frames can repeat, which makes
# any later label-based alignment ambiguous.
df = pd.concat([joe_biden, donald_trump], ignore_index=True)

In [None]:
# Preview a handful of cleaned tweets; listing every row would flood the
# notebook output and bloat the saved file.
df['cleaned_tweet'].head(10).tolist()

### Exploratory Data Analysis

#### Number of Tweets by Candidate

In [None]:
# Number of tweets per candidate (non-null tweet_id values in each group)
tweet_counts = df.groupby('candidate')['tweet_id'].count()

# One bar per candidate. The colours are applied in index order; groupby sorts
# its keys, so 'Donald Trump' comes first and 'Joe Biden' second.
layout_font = dict(family="Courier New, monospace", size=18, color="#7f7f7f")
candidate_bars = go.Bar(
    x=tweet_counts.index,
    y=tweet_counts,
    text=tweet_counts,
    textposition='auto',
    marker_color=['#E9141D', '#00308F'],
)
fig = go.Figure(data=[candidate_bars])
fig.update_layout(
    title="Number of Tweets by Candidate",
    xaxis_title="Candidate",
    yaxis_title="Number of Tweets",
    font=layout_font,
)
fig.show()

#### Number of Tweets by State

In [None]:
def _state_counts(candidate):
    """Return tweet counts per state for one candidate's US tweets, largest first."""
    selected = df[(df['candidate'] == candidate) & (df['country'] == "United States")]
    return (
        selected.dropna(subset=['state', 'country'])
        .groupby(by='state')
        .count()
        .tweet
        .sort_values(ascending=False)
    )


# Each candidate's series is computed once; the bar positions are its index.
y = _state_counts("Joe Biden")
x = y.index
y2 = _state_counts("Donald Trump")
x2 = y2.index
fig = go.Figure([go.Bar(x=x, y=y, name='Joe Biden'),
                 go.Bar(x=x2, y=y2, name='Donald Trump')])

# Customize aspect

fig.update_layout(
    title="Number of Tweets by Candidate and by State",
    xaxis_title="States",
    yaxis_title="Number of Tweets",
    font=dict(
        family="Courier New, monospace",
        size=12,
        color="#7f7f7f"
    )
)
fig.show()

#### Number of Tweets by Day

In [None]:
### Number of tweets per day by candidate
# created_at must be a datetime column for the daily grouping below
df['created_at'] = pd.to_datetime(df['created_at'])

# One frame per candidate
trump_df = df[df['candidate'] == 'Donald Trump']
biden_df = df[df['candidate'] == 'Joe Biden']


def _tweets_per_day(frame):
    """Return the number of rows per calendar day of ``created_at``."""
    return frame.groupby(pd.Grouper(key='created_at', freq='D')).size()


trump_count = _tweets_per_day(trump_df)
biden_count = _tweets_per_day(biden_df)

# One bar trace per candidate
daily_traces = [
    go.Bar(x=trump_count.index, y=trump_count, name='Donald Trump', marker_color='#E9141D'),
    go.Bar(x=biden_count.index, y=biden_count, name='Joe Biden', marker_color='#00308F'),
]
fig = go.Figure(data=daily_traces)

fig.update_layout(
    title="Number of Tweets Per Day And By Candidate",
    xaxis_title="Date",
    yaxis_title="Number of Tweets",
    font=dict(family="Courier New, monospace", size=18, color="#7f7f7f"),
)

fig.show()

#### Positive Joe Biden Tweets by State

In [None]:
import plotly.express as px
import json

import os
import urllib.request

url = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data/us-states.json'
filename = 'us-states.json'

# Download the state outlines only when no local copy exists yet
if not os.path.isfile(filename):
    urllib.request.urlretrieve(url, filename)

# Positive tweets about Joe Biden, counted per state code
is_biden = df['candidate'] == 'Joe Biden'
is_positive = df['comp_score'] == 'pos'
biden_pos = df[is_biden & is_positive]
biden_pos_counts = biden_pos.groupby('state_code').size().reset_index(name='count')

# Read the GeoJSON with the US state outlines
with open(filename) as f:
    states = json.load(f)

# Choropleth coloured by the number of positive tweets
map_font = dict(family="Courier New, monospace", size=12, color="#7f7f7f")
fig = px.choropleth(
    biden_pos_counts,
    locations='state_code',
    geojson=states,
    color='count',
    scope="usa",
    color_continuous_scale=px.colors.sequential.Plasma,
    labels={'count': 'Number of Tweets'},
)
fig.update_layout(title_text='Positive Joe Biden Tweets by State', font=map_font)
fig.show()

#### Number of Positive and Negative Tweets Before the Election

In [None]:
# Keep only tweets created before 2020-11-04 00:00:00
before_election = df['created_at'] < '2020-11-04 00:00:00'
df_filtered = df[before_election]

# Tweets per (candidate, label) pair
counts = (
    df_filtered.groupby(['candidate', 'comp_score'])
    .size()
    .reset_index(name='count')
)

# One row per candidate, one column per label, with readable column names
counts_pivot = (
    counts.pivot(index='candidate', columns='comp_score', values='count')
    .reset_index()
    .rename(columns={'pos': 'Positive Tweets', 'neg': 'Negative Tweets'})
)

# One bar trace per label
fig = go.Figure()
for column, colour in (('Positive Tweets', '#E4A0F7'), ('Negative Tweets', '#414141')):
    fig.add_trace(go.Bar(x=counts_pivot['candidate'], y=counts_pivot[column],
                         name=column, marker_color=colour))
fig.update_layout(
    title='Number of Positive and Negative Tweets Before The Election',
    xaxis_title='Candidate',
    yaxis_title='Number of Tweets',
    font=dict(family="Courier New, monospace", size=18, color="#7f7f7f"),
)
fig.show()

#### Sentiment Change Over Time

In [None]:
import pandas as pd

# Truncate each timestamp to its calendar day (floor('D'))
df['round_date'] = pd.to_datetime(df['created_at']).dt.floor('D')

# Mean compound score per day, with one column per candidate
grouped = df.groupby(['round_date', 'candidate'])
sentiment = grouped['compound'].mean().unstack()

# Line plot: one line per candidate column
ax = sentiment.plot(color=['red', 'blue'])
ax.set_title('Sentiment over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Sentiment Score')
ax.legend()
plt.show()

#### Word Clouds

In [None]:
!pip install wordcloud

In [None]:
# Create function for word clouds
from wordcloud import WordCloud, STOPWORDS
def word_cloud(wd_list):
    """Draw a word cloud of at most 50 words from an iterable of strings.

    The strings are joined with spaces into one text, the wordcloud package's
    built-in STOPWORDS are excluded, and the image is drawn on a new 12x10
    matplotlib figure with the axes hidden.

    Args:
        wd_list: Iterable of text strings.
    """
    corpus = ' '.join(wd_list)
    cloud_options = dict(
        background_color='white',
        stopwords=set(STOPWORDS),
        width=1600,
        height=800,
        random_state=1,
        colormap='jet',
        max_words=50,
        max_font_size=200,
    )
    cloud = WordCloud(**cloud_options).generate(corpus)
    plt.figure(figsize=(12, 10))
    plt.axis('off')
    plt.imshow(cloud)

# Donald Trump word cloud, built from the first 5000 cleaned tweets
word_cloud(donald_trump['cleaned_tweet'].head(5000))

In [None]:
# Joe Biden word cloud
# Only the first 5000 cleaned tweets are passed to word_cloud.
word_cloud(joe_biden['cleaned_tweet'][:5000])

### Word Similarity

In [None]:
# Array of every cleaned tweet (both candidates); input to the tokenisation below.
tweets = df.cleaned_tweet.values

In [None]:
# Display the array of cleaned tweets.
tweets

In [None]:
#only once
# NOTE(review): word_tokenize is already called by clean() and lemmatize_tag()
# in the earlier cleaning cells, so on a fresh environment this download comes
# too late; it should run before those cells.
nltk.download('punkt')

In [None]:
# Tokenise every cleaned tweet; the result is a list of token lists, the input
# format passed to Word2Vec below.
tweetsVec = [nltk.word_tokenize(cleaned_tweet) for cleaned_tweet in tweets]

In [None]:
# Preview the first few token lists; displaying all of them would flood the
# notebook output and bloat the saved file.
tweetsVec[:5]

In [None]:
# Train 15-dimensional word vectors on the tokenised tweets, ignoring words seen
# fewer than 4 times and using a context window of 3. The `size` keyword matches
# the gensim==3.6.0 pin installed in the first cell.
# NOTE(review): a fixed seed combined with workers=4 may not give identical
# vectors on every run - confirm against the gensim documentation.
model = Word2Vec(tweetsVec, min_count = 4, size=15, window=3, workers = 4, seed = 123)

In [None]:
# The 15 vocabulary words closest to 'joebiden', as (word, similarity) pairs
similar_biden = model.wv.most_similar('joebiden', topn=15)
words, scores = zip(*similar_biden)
x = list(words)
y = list(scores)

fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(x=x, y=y, color="blue", saturation=.5, ax=ax)
ax.set_title('Closest words to Joe Biden')
# Tilt the word labels so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
# The y-axis spans only the observed similarity range
ax.set_ylim(np.min(y), np.max(y))
ax.set_xlabel('Words')
ax.set_ylabel('Proportion of similarities')

In [None]:
# The 15 vocabulary words closest to 'donaldtrump', as (word, similarity) pairs
similar_trump = model.wv.most_similar('donaldtrump', topn=15)
words, scores = zip(*similar_trump)
x = list(words)
y = list(scores)

fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(x=x, y=y, color="red", saturation=.5, ax=ax)
ax.set_title('Closest words to Donald Trump')
# Tilt the word labels so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
# The y-axis spans only the observed similarity range
ax.set_ylim(np.min(y), np.max(y))
ax.set_xlabel('Words')
ax.set_ylabel('Proportion of similarities')

### Topic Modeling - LDA

#### On tweets

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts of the cleaned tweets: terms appearing in more than 95% of
# the tweets or in fewer than 2 tweets are dropped, as are sklearn's English
# stop words.
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
df_cv = cv.fit_transform(df['cleaned_tweet'])

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Fit a 7-topic LDA model on the tweet count matrix; random_state fixes the seed.
LDA = LatentDirichletAllocation(n_components=7,random_state=42)
LDA.fit(df_cv)

In [None]:
# Show top words per topic.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is used when available, with a fallback for older
# releases. The vocabulary is fetched once instead of once per topic.
if hasattr(cv, 'get_feature_names_out'):
    vocabulary = cv.get_feature_names_out()
else:
    vocabulary = cv.get_feature_names()

for index, topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    # argsort is ascending, so the last 15 positions are the heaviest words,
    # listed from lightest to heaviest
    print([str(vocabulary[i]) for i in topic.argsort()[-15:]])
    print('\n')

#### On users' descriptions

In [None]:
# Add a cleaned_user_description column
# Same clean -> lemmatize_tag steps as for the tweets; unlike cleaned_tweet,
# remove_emoji is not applied here.
df['cleaned_user_description'] = df['user_description'].apply(clean)
df['cleaned_user_description'] = df['cleaned_user_description'].apply(lemmatize_tag)

In [None]:
# Bag-of-words counts of the cleaned user descriptions. This rebinds `cv` and
# `df_cv`, replacing the tweet-level vectorizer and matrix built earlier.
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
df_cv = cv.fit_transform(df['cleaned_user_description'])

In [None]:
# Fit a 7-topic LDA model on the user-description counts. This rebinds `LDA`,
# replacing the tweet-level model fitted earlier.
LDA = LatentDirichletAllocation(n_components=7,random_state=42)
LDA.fit(df_cv)

In [None]:
# Show top words per topic.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is used when available, with a fallback for older
# releases. The vocabulary is fetched once instead of once per topic.
if hasattr(cv, 'get_feature_names_out'):
    vocabulary = cv.get_feature_names_out()
else:
    vocabulary = cv.get_feature_names()

for index, topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    # argsort is ascending, so the last 15 positions are the heaviest words,
    # listed from lightest to heaviest
    print([str(vocabulary[i]) for i in topic.argsort()[-15:]])
    print('\n')

### Classifier 

In [None]:
#Prepare the dataset for the model

# Keep only the model input (text) and the target (label). .copy() makes the
# result an independent frame, so the assignment below cannot warn about
# writing to a slice.
df = df.loc[:, ['cleaned_tweet', 'comp_score']].copy()

# Encode the label as an integer in one vectorised step: "pos" -> 1, and
# everything else ("neg" or a missing label) -> 0. Accepting an existing 1 as
# well keeps the cell safe to re-run on already-encoded labels.
labels = df['comp_score']
df['comp_score'] = ((labels == 'pos') | (labels == 1)).astype('int')

# Drop rows that still contain a missing value
df = df.dropna()

df

In [None]:
#Train and Test splitting

from sklearn.model_selection import train_test_split

# Features are the cleaned tweet text, target is the 0/1 sentiment label.
X = df["cleaned_tweet"]  # this time we want to look at the text
y = df["comp_score"]

# Hold out 33% of the rows for testing; random_state makes the split repeatable.
# No `stratify` argument is passed, so the class balance of the two parts is
# whatever the random split produces.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
#TFidVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with default settings; it is fitted on the training text only.
vectorizer = TfidfVectorizer()

# NOTE: X_train is rebound from the raw text to the TF-IDF matrix. Re-running
# this cell without re-running the split would feed that matrix back into
# fit_transform instead of the text.
X_train= vectorizer.fit_transform(
    X_train
)  # remember to use the original X_train set
X_train.shape

In [None]:
#Try Logistic Regression and Decision Tree models and see which one performs better so we can tune its parameters
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Create and train different classification models
models = [
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    # random_state pins the tree so repeated runs report the same scores,
    # consistent with the seeds used elsewhere in this notebook
    ("Decision Tree", DecisionTreeClassifier(random_state=42))
]


def _macro_scores(y_true, y_hat):
    """Return (accuracy, macro precision, macro recall, macro F1) for predictions."""
    return (
        accuracy_score(y_true, y_hat),
        precision_score(y_true, y_hat, average="macro"),
        recall_score(y_true, y_hat, average="macro"),
        f1_score(y_true, y_hat, average="macro"),
    )


def _print_scores(header, scores):
    """Print a header line followed by the four scores with two decimals."""
    print(header)
    for label, value in zip(("Accuracy", "Precision", "Recall", "F1 Score"), scores):
        print(f"{label}: {value:.2f}")


# The test text is vectorised once with the vectorizer fitted on the training
# set; the result does not depend on the classifier being evaluated.
X_test_transformed = vectorizer.transform(X_test)

# The loop variable is named `clf` so that the Word2Vec `model` built in the
# word-similarity section is not overwritten by a classifier.
for name, clf in models:
    print(f"Training {name} model...")
    clf.fit(X_train, y_train)

    # Scores on the data the classifier was fitted on
    y_train_pred = clf.predict(X_train)
    train_scores = _macro_scores(y_train, y_train_pred)

    # Scores on the held-out test set
    y_pred = clf.predict(X_test_transformed)
    test_scores = _macro_scores(y_test, y_pred)

    _print_scores(f"{name} model training set results:", train_scores)
    print("-" * 50)
    _print_scores(f"{name} model test set results:", test_scores)
    print("=" * 50)

In [None]:
#Logistic Regression Model
from sklearn.model_selection import cross_val_score

# Logistic Regression, evaluated with 5-fold cross-validated accuracy on the
# training matrix
logreg = LogisticRegression(max_iter=1000)
cv_scores = cross_val_score(logreg, X_train, y_train, cv=5, scoring='accuracy')

print("Logistic Regression cross-validation accuracy scores:")
print(cv_scores)
print("Mean cross-validation accuracy:", cv_scores.mean())

# Fit on the whole training set, then score that same set
logreg.fit(X_train, y_train)
y_train_pred = logreg.predict(X_train)

macro = {"average": "macro"}
train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision, train_recall, train_f1 = (
    scorer(y_train, y_train_pred, **macro)
    for scorer in (precision_score, recall_score, f1_score)
)

# Vectorise the held-out text with the fitted vectorizer and score it
X_test_transformed = vectorizer.transform(X_test)
y_pred = logreg.predict(X_test_transformed)

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, **macro)
    for scorer in (precision_score, recall_score, f1_score)
)

# Report: training block, separator, test block
print("Logistic Regression model training set results:")
for label, value in (("Accuracy", train_accuracy), ("Precision", train_precision),
                     ("Recall", train_recall), ("F1 Score", train_f1)):
    print(f"{label}: {value:.2f}")
print("-" * 50)

print("Logistic Regression model test set results:")
for label, value in (("Accuracy", accuracy), ("Precision", precision),
                     ("Recall", recall), ("F1 Score", f1)):
    print(f"{label}: {value:.2f}")


In [None]:
#Improve the model with GridSearch
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load and preprocess the dataset (assuming this part is already done)

# Split the dataset into training and test sets (assuming this part is already done)

# Vectorize the text data (assuming this part is already done)

# Define the hyperparameter grid.
# Only solver/penalty pairs that LogisticRegression accepts are listed:
# newton-cg and lbfgs support the l2 penalty only, liblinear supports l1 and l2.
# 'elasticnet' is not searched because none of these three solvers implements
# it (it needs solver='saga' together with an l1_ratio). Asking GridSearchCV
# for unsupported pairs only produces failed fits that are scored as NaN.
C_VALUES = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = [
    {'C': C_VALUES, 'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'liblinear']},
    {'C': C_VALUES, 'penalty': ['l1'], 'solver': ['liblinear']},
]

# Create the Logistic Regression model
logreg = LogisticRegression(max_iter=1000)

# Create the GridSearchCV object with 5 folds, using every available core
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding accuracy score
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# GridSearchCV refits the best configuration on the whole training set by
# default (refit=True), so best_estimator_ is already trained.
best_logreg = grid_search.best_estimator_

# Calculate metrics on the training set
y_train_pred = best_logreg.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred, average="macro")
train_recall = recall_score(y_train, y_train_pred, average="macro")
train_f1 = f1_score(y_train, y_train_pred, average="macro")

# Transform X_test using the vectorizer fitted on the training text
X_test_transformed = vectorizer.transform(X_test)

# Make predictions using the test set
y_pred = best_logreg.predict(X_test_transformed)

# Evaluate the model performance using different metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
f1 = f1_score(y_test, y_pred, average="macro")

# Print the performance metrics for the training set
print("Logistic Regression model training set results:")
print(f"Accuracy: {train_accuracy:.2f}")
print(f"Precision: {train_precision:.2f}")
print(f"Recall: {train_recall:.2f}")
print(f"F1 Score: {train_f1:.2f}")
print("-" * 50)

# Print the performance metrics for the test set
print("Logistic Regression model test set results:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


In [None]:
#Create Plots
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Create a confusion matrix for the test set predictions
cm = confusion_matrix(y_test, y_pred)

# Plot the confusion matrix
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png')

# Plot the ROC curve.
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator replaces it (requires scikit-learn >= 1.0).
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_estimator(best_logreg, X_test_transformed, y_test)
plt.title('ROC Curve')
plt.savefig('roc_curve.png')

# Plot the precision-recall curve.
# plot_precision_recall_curve was removed together with plot_roc_curve;
# PrecisionRecallDisplay.from_estimator replaces it.
from sklearn.metrics import PrecisionRecallDisplay
PrecisionRecallDisplay.from_estimator(best_logreg, X_test_transformed, y_test)
plt.title('Precision-Recall Curve')
plt.savefig('precision_recall_curve.png')

# Plot the feature importances: one logistic-regression coefficient per
# vocabulary term. get_feature_names() was removed in scikit-learn 1.2, so
# get_feature_names_out() is used when available.
feature_importances = best_logreg.coef_.ravel()
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
# The 20 largest coefficients
importance_df = importance_df.sort_values('Importance', ascending=False)[:20]

plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Top 20 Important Features')
plt.savefig('feature_importances.png')



In [None]:
#Save Them
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Create a confusion matrix for the test set predictions
cm = confusion_matrix(y_test, y_pred)

# Plot the confusion matrix
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png')

# Plot the ROC curve.
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator replaces it (requires scikit-learn >= 1.0).
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_estimator(best_logreg, X_test_transformed, y_test)
plt.title('ROC Curve')
plt.savefig('roc_curve.png')

# Plot the precision-recall curve.
# plot_precision_recall_curve was removed together with plot_roc_curve;
# PrecisionRecallDisplay.from_estimator replaces it.
from sklearn.metrics import PrecisionRecallDisplay
PrecisionRecallDisplay.from_estimator(best_logreg, X_test_transformed, y_test)
plt.title('Precision-Recall Curve')
plt.savefig('precision_recall_curve.png')

# Plot the feature importances: one logistic-regression coefficient per
# vocabulary term. get_feature_names() was removed in scikit-learn 1.2, so
# get_feature_names_out() is used when available.
feature_importances = best_logreg.coef_.ravel()
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
# Label each term by the sign of its coefficient
importance_df['Class'] = ['Negative' if imp < 0 else 'Positive' for imp in importance_df['Importance']]
# The 20 largest coefficients
importance_df = importance_df.sort_values('Importance', ascending=False)[:20]

plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature', hue='Class')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Top 20 Important Features')
plt.savefig('feature_importances.png')


In [None]:
#Add distinct feature (words) importance for Positive and Negative tweets
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Create a confusion matrix for the test set predictions
cm = confusion_matrix(y_test, y_pred)

# Plot the confusion matrix
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png')

# Plot the ROC curve.
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator replaces it (requires scikit-learn >= 1.0).
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_estimator(best_logreg, X_test_transformed, y_test)
plt.title('ROC Curve')
plt.savefig('roc_curve.png')

# Plot the precision-recall curve.
# plot_precision_recall_curve was removed together with plot_roc_curve;
# PrecisionRecallDisplay.from_estimator replaces it.
from sklearn.metrics import PrecisionRecallDisplay
PrecisionRecallDisplay.from_estimator(best_logreg, X_test_transformed, y_test)
plt.title('Precision-Recall Curve')
plt.savefig('precision_recall_curve.png')

# Coefficients and vocabulary are computed here so the cell does not rely on
# variables left behind by another plotting cell. get_feature_names() was
# removed in scikit-learn 1.2, so get_feature_names_out() is used when available.
feature_importances = best_logreg.coef_.ravel()
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
all_features_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})

# The 20 largest coefficients (terms pushing towards the positive class)
pos_df = all_features_df.sort_values('Importance', ascending=False).head(20).copy()
pos_df['Target'] = 'Positive'

# The 20 smallest coefficients (terms pushing towards the negative class)
neg_df = all_features_df.sort_values('Importance', ascending=True).head(20).copy()
neg_df['Target'] = 'Negative'

# Concatenate the two dataframes. A dedicated name is used so the tweet
# dataset held in `df` is not overwritten.
top_features_df = pd.concat([pos_df, neg_df])

# Plot the feature importances for positive and negative target variables separately
g = sns.catplot(data=top_features_df, x='Importance', y='Feature', hue='Target', kind='bar', height=10, aspect=0.8)
g.ax.set_xlabel('Importance')
g.ax.set_ylabel('Feature')
g.ax.set_title('Top 20 Important Features')
plt.savefig('feature_importances.png')
plt.show()
