In [4]:
import re, string

def clean_tweet(tweet):
    tweet = re.sub(r'@[A-Za-z0-9]+', '', tweet)  # Remove mentions
    tweet = re.sub(r'#', '', tweet)  # Remove hashtags
    tweet = re.sub(r'RT[\s]+', '', tweet)  # Remove RT
    tweet = re.sub(r'[\s]+https?://\S+', '', tweet)  # Remove URLs
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tweet = re_punc.sub('', tweet)  # Remove punctuations
    return tweet

clean_tweet("RT @user123 Loving the #sunny weather! Check this out: https://example.com")

'Loving the sunny weather Check this out'

In [22]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment score
def get_sentiment(tweet):
    sentiment_dict = analyzer.polarity_scores(tweet)
    print(sentiment_dict)
    return sentiment_dict['compound']  # compound score is a normalized score between -1 and 1

# get_sentiment("RT @JPDesloges: Why AAPL Stock Had a Mini-Flash Crash Today $AAPL #aapl http://t.co/hGFcjYa0E9")
get_sentiment("Why AAPL Stock Had a MiniFlash Crash Today AAPL aapl")

# get_sentiment("Top 3 all @Apple #tablets. Damn right! http://t.co/RJiGn2JUuB")
# get_sentiment("Top 3 all  tablets Damn right")

{'neg': 0.231, 'neu': 0.769, 'pos': 0.0, 'compound': -0.4019}
{'neg': 0.277, 'neu': 0.556, 'pos': 0.167, 'compound': -0.2942}


-0.2942

In [23]:
from textblob import TextBlob

# Function to get the sentiment of a tweet
def get_sentiment(tweet):
    analysis = TextBlob(tweet)
    return analysis.sentiment.polarity

get_sentiment("Top 3 all @Apple #tablets. Damn right! http://t.co/RJiGn2JUuB")
get_sentiment("Top 3 all  tablets Damn right")


0.39285714285714285

Load and Explore the Data

In [22]:
import pandas as pd

# Load the data
df = pd.read_csv('stocks.csv')

# Display the first few rows of the dataframe
display(df.head())


Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Tweet,Sentiment
0,0,2020-01-02,74.059998,75.150002,73.797501,75.087502,73.059425,135480400,AAPLThe 10 best Steve Jobs emails ever,1.0
1,1,2020-01-03,74.287498,75.144997,74.125,74.357498,72.349144,146322800,Why AAPL Stock Had a MiniFlash Crash Today AA...,0.0
2,2,2020-01-06,73.447502,74.989998,73.1875,74.949997,72.925636,118387200,My cat only chews cords Such an AppleSnob,0.0
3,3,2020-01-07,74.959999,75.224998,74.370003,74.597504,72.582664,108872000,I agree with that the IndividualInvestor shou...,0.65
4,4,2020-01-08,74.290001,76.110001,74.290001,75.797501,73.750252,132079200,Nobody expects the Spanish Inquisition AAPL,0.0


Data Preprocessing

In [23]:
# Convert 'Date' to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Check for missing values
display(df.isnull().sum())

# Drop missing values for simplicity
df = df.dropna()

# Ensure data is sorted by date
df = df.sort_values('Date')

# Display the cleaned dataframe
display(df.head())


Unnamed: 0    0
Date          0
Open          0
High          0
Low           0
Close         0
Adj Close     0
Volume        0
Tweet         0
Sentiment     0
dtype: int64

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Tweet,Sentiment
0,0,2020-01-02,74.059998,75.150002,73.797501,75.087502,73.059425,135480400,AAPLThe 10 best Steve Jobs emails ever,1.0
1,1,2020-01-03,74.287498,75.144997,74.125,74.357498,72.349144,146322800,Why AAPL Stock Had a MiniFlash Crash Today AA...,0.0
2,2,2020-01-06,73.447502,74.989998,73.1875,74.949997,72.925636,118387200,My cat only chews cords Such an AppleSnob,0.0
3,3,2020-01-07,74.959999,75.224998,74.370003,74.597504,72.582664,108872000,I agree with that the IndividualInvestor shou...,0.65
4,4,2020-01-08,74.290001,76.110001,74.290001,75.797501,73.750252,132079200,Nobody expects the Spanish Inquisition AAPL,0.0


 Feature Engineering

In [28]:
# Assuming sentiment scores are in the dataframe (we need to aggregate them as discussed earlier)
# For example, df['Sentiment'] should contain the aggregated sentiment scores

# Create lag features
for lag in range(1, 6):  # Create lag features for the past 5 days
    df[f'High_lag_{lag}'] = df['High'].shift(lag)
    df[f'Low_lag_{lag}'] = df['Low'].shift(lag)
    df[f'Open_lag_{lag}'] = df['Open'].shift(lag)
    df[f'Close_lag_{lag}'] = df['Close'].shift(lag)
    df[f'Volume_lag_{lag}'] = df['Volume'].shift(lag)
    df[f'Sentiment_lag_{lag}'] = df['Sentiment'].shift(lag)

# Drop rows with missing lag values
df = df.dropna()
display(df)

# Define features and target variables
features = [col for col in df.columns if 'lag' in col]
target = ['High', 'Low', 'Open', 'Close', 'Volume']

# Split the data
from sklearn.model_selection import train_test_split

X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Tweet,Sentiment,...,Open_lag_4,Close_lag_4,Volume_lag_4,Sentiment_lag_4,High_lag_5,Low_lag_5,Open_lag_5,Close_lag_5,Volume_lag_5,Sentiment_lag_5
10,10,2020-01-16,78.397499,78.925003,78.022499,78.809998,76.681358,108829200,WTF MY BATTERY WAS 31 ONE SECOND AGO AND NOW I...,-0.333333,...,77.650002,77.582497,140644800.0,0.392857,77.607498,76.550003,76.809998,77.407501,170108400.0,0.500000
11,11,2020-01-17,79.067497,79.684998,78.750000,79.682503,77.530319,137816400,Apple Watch Tops Search Engine List of Best We...,1.000000,...,77.910004,79.239998,121532000.0,0.200000,78.167503,77.062500,77.650002,77.582497,140644800.0,0.392857
12,12,2020-01-21,79.297501,79.754997,79.000000,79.142502,77.004913,110843200,The BestDesigned iPhone Apps In the World Acco...,0.000000,...,79.175003,78.169998,161954400.0,0.000000,79.267502,77.787498,77.910004,79.239998,121532000.0,0.200000
13,13,2020-01-22,79.644997,79.997498,79.327499,79.425003,77.279770,101832400,Bought my at the storepretty good logo matc...,0.700000,...,77.962502,77.834999,121923600.0,0.000000,79.392502,78.042503,79.175003,78.169998,161954400.0,0.000000
14,14,2020-01-23,79.480003,79.889999,78.912498,79.807503,77.651955,104472000,Contact sync between Yosemite and iOS8 is ser...,-0.027778,...,78.397499,78.809998,108829200.0,-0.333333,78.875000,77.387497,77.962502,77.834999,121923600.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751,751,2022-12-23,130.919998,132.419998,129.639999,131.860001,130.959961,63814900,Apples Eddy Cue Talks About PriceFixing On EBo...,0.000000,...,135.110001,132.369995,79592600.0,0.000000,137.649994,133.729996,136.690002,134.509995,160156900.0,0.000000
752,752,2022-12-27,131.380005,131.410004,128.720001,130.029999,129.142456,69007800,In a Bay Area Courtroom Lawyers Hit Replay on ...,0.000000,...,131.389999,132.300003,77432800.0,0.500000,135.199997,131.320007,135.110001,132.369995,79592600.0,0.000000
753,753,2022-12-28,129.669998,131.029999,125.870003,126.040001,125.179680,85438400,SteveJobs Emails Show How He Wanted To Smear C...,0.000000,...,132.979996,135.449997,85928000.0,0.000000,133.250000,129.889999,131.389999,132.300003,77432800.0,0.500000
754,754,2022-12-29,127.989998,130.479996,127.730003,129.610001,128.725311,75703700,Stop liking man,0.000000,...,134.350006,132.229996,77852100.0,0.000000,136.809998,132.750000,132.979996,135.449997,85928000.0,0.000000


Model Training

In [25]:
from sklearn.linear_model import LinearRegression

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

Model Evaluation

In [26]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Make predictions
y_pred = model.predict(X_test)

# Calculate performance metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')


Mean Absolute Error: 4783681.659540411
Mean Squared Error: 227027440777675.34


Prediction

In [27]:
# Assume we have new data for prediction
# Replace this with actual new data for prediction
new_data = {
    'High_lag_1': [150], 'Low_lag_1': [145], 'Open_lag_1': [148], 'Close_lag_1': [147], 'Volume_lag_1': [100000],
    'Sentiment_lag_1': [0.4], 'High_lag_2': [151], 'Low_lag_2': [146], 'Open_lag_2': [149], 'Close_lag_2': [148],
    'Volume_lag_2': [105000], 'Sentiment_lag_2': [0.3], 'High_lag_3': [152], 'Low_lag_3': [147], 'Open_lag_3': [150],
    'Close_lag_3': [149], 'Volume_lag_3': [110000], 'Sentiment_lag_3': [0.5], 'High_lag_4': [153], 'Low_lag_4': [148],
    'Open_lag_4': [151], 'Close_lag_4': [150], 'Volume_lag_4': [115000], 'Sentiment_lag_4': [0.6], 'High_lag_5': [154],
    'Low_lag_5': [149], 'Open_lag_5': [152], 'Close_lag_5': [151], 'Volume_lag_5': [120000], 'Sentiment_lag_5': [0.7]
}
new_data = pd.DataFrame(new_data)

# Predict for a new day
predicted_prices = model.predict(new_data)
print(f'Predicted High: {predicted_prices[0][0]}')
print(f'Predicted Low: {predicted_prices[0][1]}')
print(f'Predicted Open: {predicted_prices[0][2]}')
print(f'Predicted Close: {predicted_prices[0][3]}')
print(f'Predicted Volume: {predicted_prices[0][4]}')


Predicted High: 148.4559810694364
Predicted Low: 144.0944550786125
Predicted Open: 146.19709426696684
Predicted Close: 146.79132176870166
Predicted Volume: 21791468.594538018
