In [34]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn import preprocessing
from scipy.stats.mstats import winsorize
from scipy.stats import skew
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [35]:
# Load the three inputs: stock fundamentals/prices plus the tweet and news
# sentiment tables. All files are read with an explicit "\n" line terminator.
stock = pd.read_csv("stocks2.csv", lineterminator='\n')
tweets_sent = pd.read_csv("tweets_sent.csv", lineterminator='\n')
news_sent = pd.read_csv("news_sent.csv", lineterminator='\n')

# Tickers removed from the stock table (the original note says this keeps
# only the main 20 stocks — the resulting count is not checked here).
excluded_tickers = ["ASMLF", "AVGOP", "GOOGL", "AZNCF", "CCZ"]

stock = (
    stock
    # A missing dividend is treated as a dividend of zero.
    .assign(div=stock['div'].fillna(0))
    .loc[lambda frame: ~frame["tic"].isin(excluded_tickers)]
    # Align the key names with the sentiment tables so they can be merged.
    .rename(columns={"datadate": "date", "tic": "company"})
)

# Give each sentiment score a distinct column name before merging.
tweets_sent = tweets_sent.rename(columns={"Polarity": "tweets_sent"})
news_sent = news_sent.rename(columns={"Polarity": "news_sent"})

# Left-join sentiment onto the stock rows by (date, company).
# NOTE(review): fillna(0) is applied to the whole merged frame, so it zero-fills
# every column that has missing values, not only the sentiment scores — confirm
# this is intended for the price/fundamental columns.
merge_keys = ['date', 'company']
df = stock.merge(tweets_sent, on=merge_keys, how='left').fillna(0)
df = df.merge(news_sent, on=merge_keys, how='left').fillna(0)

# Keep only the columns used downstream, in a fixed order.
model_columns = [
    'date', 'company',
    'ajexdi', 'epsmo', 'trfd',
    'cshoc', 'cshtrd', 'eps', 'div',
    'prccd', 'prchd', 'prcld',
    'prcod', 'tweets_sent', 'news_sent',
]
df = df[model_columns]

In [36]:
# Sort by date (ties broken by company) so the per-company differences and
# shifts below run forward in time within each company.
df = df.sort_values(['date', 'company'])

# Simple return of `prccd` relative to the previous row of the same company.
df['simp_ret'] = df.groupby('company')['prccd'].pct_change()

# Log return in percent. The transform is element-wise, so it is applied to the
# whole column directly; routing it through groupby(...).apply(...) yields a
# result indexed by (company, row) on recent pandas versions, which cannot be
# assigned back to the frame.
df['ret'] = np.log(1 + df['simp_ret']) * 100

# Prediction target: the same company's log return on its next row.
df['ret+1'] = df.groupby('company')['ret'].shift(-1)

# Drop every row that contains a NaN. That removes each company's first row
# (no previous price, so no return) and also its last row (no next-row return
# for 'ret+1').
# A plain dropna() replaces groupby('company').apply(dropna), which left
# 'company' as both an index level and a column. The stable sort by company
# reproduces the row order the groupby version produced (companies in sorted
# order, dates ascending within each company).
df = (
    df.dropna()
      .sort_values('company', kind='stable')
      .reset_index(drop=True)
)

In [38]:
# Winsorize the upper tail to reduce skewness: nothing is clipped at the bottom
# (lower limit 0.00) and the top 5% of each column is capped (upper limit 0.05).
# Each column is winsorized over all rows of the frame at once.
upper_tail_limits = (.00, .05)
winsor_cols = ('ajexdi', 'eps', 'prccd', 'prchd', 'prcld', 'prcod', 'cshoc', 'cshtrd')

for col in winsor_cols:
    df[col] = winsorize(df[col], limits=upper_tail_limits)

In [28]:
# Numeric feature columns that are rescaled; the return and sentiment columns
# are left untouched.
cols_to_scale = ['ajexdi', 'epsmo', 'trfd', 'cshoc', 'cshtrd', 'eps', 'prccd', 'prchd', 'prcld', 'prcod', 'div']

# Map each listed column linearly onto [-1, 1]. The scaler is fitted on all
# rows of the frame and then applied to those same rows.
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(df[cols_to_scale])
df[cols_to_scale] = scaler.transform(df[cols_to_scale])

In [10]:
# Write the prepared frame to disk; the index is not written to the file.
df.to_csv(path_or_buf="ml_df.csv", index=False)