# In [None]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tweepy
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# --- 1. Twitter API Setup ---
# Credentials are read from environment variables so that real secrets never
# have to be written into this file. When a variable is not set, the original
# placeholder string is used instead.
# **IMPORTANT: Set these variables (or replace the placeholders) with your own
# Twitter API credentials**
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "YOUR_CONSUMER_KEY")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "YOUR_CONSUMER_SECRET")
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "YOUR_ACCESS_TOKEN")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "YOUR_ACCESS_TOKEN_SECRET")

# OAuth 1a user-context authentication: the consumer pair identifies the app,
# the access-token pair identifies the user account.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# `api` is the client object used for the search request further down.
api = tweepy.API(auth, wait_on_rate_limit=True)

# --- 2. Data Collection ---
hashtag = "#machinelearning"  # Define the hashtag to search for
num_tweets = 1000  # Define the number of tweets to collect

tweets = tweepy.Cursor(api.search_tweets, q=hashtag, lang="en", tweet_mode='extended').items(num_tweets)


def _full_text(tweet):
    """Return the untruncated text of *tweet*.

    Even with tweet_mode='extended', the ``full_text`` of a Retweet may be cut
    off with an ellipsis; the complete text is carried by the
    ``retweeted_status`` object, so that one is preferred when it exists.
    """
    source = getattr(tweet, 'retweeted_status', tweet)
    return source.full_text


# One row per tweet, in the same order as `columns` below.
tweet_data = [
    [tweet.created_at, _full_text(tweet), tweet.retweet_count, tweet.favorite_count,
     tweet.user.followers_count, tweet.user.verified, tweet.entities['hashtags']]
    for tweet in tweets
]

columns = ['created_at', 'text', 'retweet_count', 'favorite_count', 'user_followers_count', 'user_verified', 'hashtags']
df = pd.DataFrame(tweet_data, columns=columns)

# Save the data to a CSV file (optional)
# NOTE: the file name is built from `hashtag` verbatim, so it starts with '#'.
df.to_csv(f"{hashtag}_tweets.csv", index=False)

# --- 3. Data Exploration and Preprocessing ---
# (If loading from CSV: df = pd.read_csv(f"{hashtag}_tweets.csv"))

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print(df.describe())
print(df.isnull().sum())

# Histogram of the prediction target to inspect its distribution.
plt.figure(figsize=(10, 6))
sns.histplot(df['retweet_count'], bins=50)
plt.xlabel("Retweet Count")
plt.ylabel("Frequency")
plt.title("Distribution of Retweet Counts")
plt.show()

def clean_text(text):
    """Return a lowercased copy of *text* with tweet-specific noise removed.

    The following are deleted, in this order: URLs (``http...``/``www...``),
    ``@mentions`` together with bare ``#`` characters (the hashtag word itself
    is kept), runs of digits, and finally every character that is neither a
    word character nor whitespace. Whitespace is left untouched, so removed
    tokens may leave consecutive spaces behind.
    """
    # (pattern, flags) pairs, applied sequentially; order matters because
    # later patterns would otherwise break up URLs and mentions.
    substitutions = (
        (r"http\S+|www\S+|https\S+", re.MULTILINE),
        (r'\@\w+|\#', 0),
        (r'\d+', 0),
        (r'[^\w\s]', 0),
    )
    cleaned = text.lower()
    for pattern, flags in substitutions:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)
    return cleaned

# Normalized copy of each tweet; the raw 'text' column is kept for the counts below.
df['cleaned_text'] = df['text'].apply(clean_text)

# --- 4. Feature Engineering ---
# All counts are taken on the raw text, where '#', '@' and URLs are still present.
df['num_hashtags'] = df['text'].apply(lambda raw: raw.count('#'))
df['num_mentions'] = df['text'].apply(lambda raw: raw.count('@'))
df['num_urls'] = df['text'].apply(lambda raw: len(re.findall(r"http\S+|www\S+|https\S+", raw)))
df['text_length'] = df['text'].apply(len)

def avg_word_length(text):
    """Return the mean length of the whitespace-separated words in *text*.

    Returns 0 when *text* contains no words (empty or whitespace only).
    """
    tokens = text.split()
    if not tokens:
        return 0
    total_chars = sum(map(len, tokens))
    return total_chars / len(tokens)

# Average word length is measured on the cleaned text, not the raw tweet.
df['avg_word_length'] = df['cleaned_text'].apply(avg_word_length)

# Convert to datetime first so the .dt accessor is available.
df['created_at'] = pd.to_datetime(df['created_at'])
timestamps = df['created_at']
df['day_of_week'] = timestamps.dt.dayofweek
df['hour_of_day'] = timestamps.dt.hour

# --- 5. Model Training ---
target = 'retweet_count'
features = [
    'num_hashtags',
    'num_mentions',
    'num_urls',
    'text_length',
    'avg_word_length',
    'day_of_week',
    'hour_of_day',
    'user_followers_count',
    'user_verified',
]

X, y = df[features], df[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Features are standardized before the linear model is fitted. The step names
# are looked up later through pipeline.named_steps.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
])
pipeline.fit(X_train, y_train)

# --- 6. Model Evaluation ---
y_pred = pipeline.predict(X_test)

# Error metrics on the held-out rows.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

# Scatter of predictions against ground truth. The dashed red diagonal marks
# where predicted == actual, spanning the observed range of y_test.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred)
ax.set_xlabel("Actual Retweet Count")
ax.set_ylabel("Predicted Retweet Count")
ax.set_title("Actual vs. Predicted Retweet Count")
diagonal = [min(y_test), max(y_test)]
ax.plot(diagonal, diagonal, linestyle='--', color='red')
plt.show()

# --- 7. Interpretation and Insights ---
# One coefficient per feature, largest first. The 'model' step is fitted on the
# output of the 'scaler' step, so the coefficients refer to scaled features.
linear_model = pipeline.named_steps['model']
coefficients = pd.DataFrame(
    {'feature': features, 'coefficient': linear_model.coef_}
).sort_values('coefficient', ascending=False)
print(coefficients)