In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Load the cleaned tweet dataset, which already carries sentiment score columns.
df = pd.read_csv("../DATA/Clean_Tweets_With_Sentiment.csv")

# Several later cells read from (and add keyword columns to) a frame named
# `cdf`, but no visible cell ever defines it, so a fresh kernel hits a
# NameError. Bind it here to a copy so the keyword indicator columns added
# later do not leak into `df`.
# NOTE(review): assumes `cdf` was meant to be this same cleaned dataset — confirm.
cdf = df.copy()

In [None]:
# linear regression - do negative sentiment scores correlate with frequent words from TF-IDF?

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Load the TF-IDF matrix; its columns are used as the word features below.
tfidf_df = pd.read_csv("tfidf_results.csv")  # Adjust path if needed

# Name of the regression target column in `df` (must exist in the dataset).
sentiment_column = "negative_sentiment_score"

# pd.concat(axis=1) aligns on the index and silently pads the shorter frame
# with NaN when the row counts differ; that would only surface later as a NaN
# error raised from LinearRegression.fit. Fail early with a clear message.
if len(tfidf_df) != len(df):
    raise ValueError(
        f"Row count mismatch: tfidf_df has {len(tfidf_df)} rows but df has {len(df)}; "
        "the TF-IDF matrix must have exactly one row per tweet in df."
    )

# Place the target next to the TF-IDF features, row for row.
df_combined = pd.concat([df[sentiment_column], tfidf_df], axis=1)

# Features (X) are every TF-IDF column; target (y) is the sentiment score.
X = df_combined.drop(columns=[sentiment_column])
y = df_combined[sentiment_column]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit an ordinary (unregularised) linear regression on the training rows.
model = LinearRegression()
model.fit(X_train, y_train)

# Score the model on the held-out rows.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.4f}")

# One coefficient per word, labelled by word and sorted from most positive to
# most negative. NOTE: because of this sort, the Series is no longer in
# tfidf_df column order; align by label before pairing it with per-column
# statistics.
feature_importance = pd.Series(model.coef_, index=X.columns).sort_values(ascending=False)
print("Top 10 most predictive words:")
print(feature_importance.head(10))

# Persist the coefficients (optional).
feature_importance.to_csv("word_correlation_scores.csv")

In [None]:
# word cloud of the words most predictive of negative sentiment scores (sized by regression coefficient)

import matplotlib.pyplot as plt
from wordcloud import WordCloud

# A word cloud sizes each word by a positive weight, but regression
# coefficients can be zero or negative. Keep only the words whose coefficient
# is positive (those that push the predicted negative-sentiment score up) and
# pass them as a plain {word: weight} dict.
positive_weights = feature_importance[feature_importance > 0].to_dict()

wordcloud = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(positive_weights)

# Render the cloud as an image; axes carry no meaning here, so hide them.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Top Predictive Words for Airline Negative Sentiment", fontsize=14)
plt.show()

In [None]:
# Turn the coefficient Series (indexed by word) into a two-column frame.
# The values are linear-regression coefficients, not VADER scores, so the
# column and the axis label below are named accordingly.
word_scores = feature_importance.rename_axis("word").reset_index(name="coefficient")

# Keep the 10 words with the largest absolute coefficient (strongest effect in
# either direction), preserving the sign for plotting.
top_words = word_scores.loc[word_scores["coefficient"].abs().nlargest(10).index]

# Horizontal bar chart: one bar per word, length = signed coefficient.
plt.figure(figsize=(8, 5))
sns.barplot(y=top_words["word"], x=top_words["coefficient"], palette="coolwarm")

# Labels and title
plt.xlabel("Regression Coefficient")
plt.ylabel("Words")
plt.title("Top 10 Predictive Words")
plt.axvline(0, color='black', linewidth=1)  # Zero line separates positive from negative coefficients
plt.grid(axis="x", linestyle="--", alpha=0.5)

plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer



# Build the VADER analyzer once and reuse it for every tweet.
analyzer = SentimentIntensityAnalyzer()


def _compound_score(tweet):
    """Return the "compound" entry of VADER's polarity scores for one tweet."""
    return analyzer.polarity_scores(tweet)["compound"]


# Score every tweet and store the result as a new column on `df`.
df["vader_score"] = df["text"].apply(_compound_score)

# Mean score per airline, ordered from lowest to highest for plotting.
airline_sentiment = (
    df.groupby("airline")["vader_score"]
    .mean()
    .reset_index()
    .sort_values(by="vader_score", ascending=True)
)

# Horizontal bar chart: one bar per airline.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=airline_sentiment["vader_score"], y=airline_sentiment["airline"], palette="coolwarm", ax=ax)

ax.set_xlabel("Average VADER Sentiment Score")
ax.set_ylabel("Airline")
ax.set_title("Average Sentiment Scores Across Airlines")
ax.axvline(0, color='black', linewidth=1, linestyle="--")  # Reference line at a score of 0
ax.grid(axis="x", linestyle="--", alpha=0.5)

plt.show()

In [None]:
# is there a difference between predicted sentiment scores included in the dataset and VADER?

In [None]:
pip install statsmodels

In [None]:
import statsmodels.api as sm

# Simple OLS regression of the negative sentiment score on the negative-reason
# confidence. `cdf` must already hold the tweets frame with both columns.
# NOTE: this cell rebinds X, y and model, replacing the objects created by the
# TF-IDF regression cell.
X = cdf[['negativereason_confidence']]  # Independent variable
y = cdf['negative_sentiment_score']  # Dependent variable

# statsmodels does not add an intercept on its own; add the constant column.
X = sm.add_constant(X)

# missing="drop" excludes rows where either variable is NaN. The default
# (missing="none") performs no NaN handling, so a single missing value would
# break the fit. With no NaNs present the result is unchanged.
# NOTE(review): presumably negativereason_confidence is empty for some tweets — confirm.
model = sm.OLS(y, X, missing="drop").fit()

# The summary includes coefficients, p-values and confidence intervals.
print(model.summary())

In [None]:
# correlation between word frequency and word importance

from scipy.stats import pearsonr

# Total TF-IDF weight of each word, in tfidf_df column order.
word_frequency = tfidf_df.sum()

# feature_importance was sorted by coefficient value when it was built, so its
# order no longer matches tfidf_df's columns. pearsonr pairs its inputs by
# position, so put the coefficients into the same word order first; otherwise
# each frequency would be paired with some other word's coefficient.
word_importance = feature_importance.reindex(word_frequency.index)

# Correlation between word frequency (TF-IDF sum) and word importance (regression coefficient)
correlation, p_value = pearsonr(word_frequency, word_importance)

# Print correlation results
print(f"Pearson Correlation: {correlation:.4f}")
print(f"P-value: {p_value:.4f}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Total TF-IDF weight of each word, in tfidf_df column order.
word_frequency = tfidf_df.sum()

# feature_importance is sorted by coefficient value, and the plot pairs x and y
# by position. Re-order the coefficients to the same word order as the
# frequencies so every point shows one word's (frequency, coefficient).
word_importance = feature_importance.reindex(word_frequency.index)

plt.figure(figsize=(8, 5))

# Apply log transformation (log1p adds 1 first, so a zero sum maps to 0 rather than -inf)
sns.regplot(x=np.log1p(word_frequency), y=word_importance, scatter_kws={"alpha": 0.5}, line_kws={"color": "red"})

plt.xlabel("Log Word Frequency (TF-IDF Sum)")
plt.ylabel("Predictive Strength (Regression Coefficients)")
plt.title("Log-Transformed Relationship Between Word Frequency and Predictive Strength")
plt.show()

In [None]:
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

# Set Seaborn style
sns.set_style("whitegrid")

# Total TF-IDF weight of each word, computed once and reused below.
word_frequency = tfidf_df.sum()

# feature_importance is sorted by coefficient value, while lowess() and
# scatter() pair their inputs by position. Re-order the coefficients to the
# same word order as the frequencies so each pair belongs to a single word.
word_importance = feature_importance.reindex(word_frequency.index)

plt.figure(figsize=(8, 5))

# LOWESS smoothing of importance (y) against frequency (x); frac=0.3 uses 30%
# of the points for each local fit.
lowess = sm.nonparametric.lowess(word_importance, word_frequency, frac=0.3)

# Plot scatter points with transparency
plt.scatter(word_frequency, word_importance, alpha=0.4, color="royalblue", edgecolors="k")

# Plot LOWESS smoothed line (column 0 = x values, column 1 = fitted values)
plt.plot(lowess[:, 0], lowess[:, 1], color="crimson", linewidth=2.5, label="LOWESS Fit")

# Labels and title
plt.xlabel("Word Frequency (TF-IDF Sum)", fontsize=12, fontweight="bold")
plt.ylabel("Predictive Strength (Regression Coefficients)", fontsize=10, fontweight="bold")
plt.title("Relationship Between Word Frequency and Predictive Strength", fontsize=14, fontweight="bold")

# Add subtle gridlines
plt.grid(color="gray", linestyle="dashed", linewidth=0.5, alpha=0.6)

# Remove top and right spines for a cleaner look
sns.despine()

# Add legend
plt.legend(frameon=False, fontsize=10)

# Show plot
plt.show()

In [None]:
# Keywords to look for in the tweet text.
keywords = ["late", "cancelled", "long lines"]

# `cdf` must already hold the tweets frame with a "text" column.
tweet_text = cdf["text"]

# Add one 0/1 indicator column per keyword, named after the keyword itself.
# str.contains does an unanchored, case-insensitive pattern search, so "late"
# also matches inside longer words such as "later"; missing text counts as 0.
for word in keywords:
    contains_word = tweet_text.str.contains(word, case=False, na=False)
    cdf[word] = contains_word.astype(int)

# Number of tweets containing each keyword.
keyword_counts = cdf[keywords].sum()
print(keyword_counts)

In [None]:
# Mean negative-reason confidence for every observed combination of the
# keyword indicator columns (the result is indexed by one level per keyword).
confidence_by_keywords = cdf.groupby(keywords)["negativereason_confidence"]
keyword_sentiment = confidence_by_keywords.mean()
print(keyword_sentiment)

In [None]:
from scipy.stats import spearmanr

# Rank correlation between each keyword indicator and the negative-reason
# confidence. `cdf` and `keywords` must already exist, with one indicator
# column per keyword.
for word in keywords:
    # Drop rows where either value is missing: spearmanr's default NaN policy
    # propagates NaN, which would turn the whole correlation into NaN.
    # NOTE(review): presumably negativereason_confidence is empty for some tweets — confirm.
    pair = cdf[[word, "negativereason_confidence"]].dropna()
    correlation, p_value = spearmanr(pair[word], pair["negativereason_confidence"])
    # The label names the column actually used (negativereason_confidence),
    # not the separate negative_sentiment_score column.
    print(f"Correlation between '{word}' and negative reason confidence: {correlation:.3f}, p-value: {p_value:.3f}")