# NLP Preprocessing

In [12]:
import numpy as np
import pandas as pd
import nltk
import data_cleaning as dc
import stock_pricing as sp
import importlib

from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

# Force reload data_cleaning and stock_pricing so edits made to those local
# modules are picked up without restarting the kernel
importlib.reload(dc)
importlib.reload(sp)

# Prepare the NLTK resources
nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()

# Define desired database parameters. Set to -1 to load all data.
database_size = 1000

# Optionally force data to be regenerated
force_data_regeneration = False

# Cache file holding the preprocessed dataframe (read and written below)
preprocessed_csv_path = './stockerbot-export-preprocessed.csv'

try:
    # If force_data_regeneration is set, force an exception to reload the data
    if force_data_regeneration:
        print('Forcing data regeneration.')
        raise ValueError('Forcing data regeneration.')

    # Load the preprocessed data if it exists
    df = pd.read_csv(preprocessed_csv_path)

    # If dataframe is not expected size, reload the data.
    # NOTE: this must use `and`, not `&`. `&` binds tighter than the
    # comparisons, so `database_size != -1 & len(df) > database_size` parses as
    # `database_size != (-1 & len(df)) > database_size`; with
    # database_size == -1 that was always true and df.sample(n=-1) raised,
    # throwing the cached file away on every run.
    if database_size != -1 and len(df) > database_size:
        # Cached file is larger than requested: take a random subset
        df = df.sample(n=database_size)
    elif database_size != -1 and len(df) < database_size:
        print('Preprocessed file is not the expected size. Reloading data.')
        raise ValueError('Preprocessed file is not the expected size.')

    print('Preprocessed file found and loaded.')
except (FileNotFoundError, ValueError):
    # Either there is no usable cache, or regeneration was requested:
    # load dataset with stock data
    df = sp.preprocess_nasdaq_df(database_size)

    # Add sentiment columns with TextBlob. The sentiment is computed once per
    # tweet and both fields are read from the same result.
    sentiments = [TextBlob(tweet).sentiment for tweet in df['text']]
    df['tweet_polarity'] = [sentiment.polarity for sentiment in sentiments]
    df['tweet_subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]

    # Apply preprocessing to the 'text' column
    df['preprocessed_tweet'] = df['text'].apply(lambda tweet: dc.preprocess_tweet(tweet, lemmatizer))

    # Save the preprocessed data
    df.to_csv(preprocessed_csv_path, index=False)
    print('File preprocessing completed and saved.')

# Display the preprocessed dataframe (full column width so tweets are not truncated)
pd.set_option('display.max_colwidth', None)
display(df.head(20))

[nltk_data] Downloading package wordnet to /Users/seby/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/seby/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Forcing data regeneration.
                       id  \
3123  1017378035509669900   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

Unnamed: 0,id,text,timestamp,source,symbols,company_names,url,verified,month,day,year,Price Day Before Tweet,Price Day of Tweet,Price Day After Tweet,tweet_polarity,tweet_subjectivity,preprocessed_tweet
3123,1017378035509669900,"Novice Traders trade 5 to 10 times too big. They are taking 5 to 10 percent risk on a trade they should be taking… https://t.co/ya7IPSVtBT,Thu Jul 12 12:00:00 +0000 2018,traderead,MCHP,Microchip Technology Incorporated,https://twitter.com/i/web/status/1017378035509669890,False\n1017378044212924400,Short sale volume (not short interest) for $CGNX at 2018-07-11 is 55%. https://t.co/jmH5MzTzhq $HON 34% $ROP 47% $XLK 74% $NOVT 51%,Thu Jul 12 12:00:02 +0000 2018,shortvolumes,ROP,Roper Technologies,http://shortvolumes.com/?t=CGNX,False\n1017378075947028500,BidaskClub top 50 #StronglyBought $EIX $SODA $GDS $TPL $AMGN $NOW $CCI $DLR $WELL $LLY $CRM $INFY $K $SNE $SO $PSA… https://t.co/e4YOIDYYL1,Thu Jul 12 12:00:09 +0000 2018,BACTop50Cies,IP,International Paper Company,https://twitter.com/i/web/status/1017378075947028485,False\n1017378386589683700,Downgrades: $AFG $AVGO $CJ $CNFR $BK $MMM $EGN $ITW $SPN $SLCA $CMCSA $PH $SM $BKH $EXR $FUN $GLNG,Thu Jul 12 12:01:23 +0000 2018,TopStockAlerts1,SLCA,U.S. 
Silica Holdings,,False\n1017378500028792800,$hum Morgan with $362 target,Thu Jul 12 12:01:51 +0000 2018,jeffwills08,HUM,Humana Inc.,,False\n1017378577837449200,EPS for McKesson $MCK Expected At $2.71; Twitter $TWTR SI Decreased By 12.34% https://t.co/nxIovYUYJd,Thu Jul 12 12:02:09 +0000 2018,The_CasualSmart,MCK,McKesson Corporation,https://cardinalweekly.com/eps-for-mckesson-mck-expected-at-2-71-twitter-twtr-si-decreased-by-12-34/,False\n1017378607864442900,Pggm Investments Decreased Kimco Rlty Com $KIM Holding; Bank Of New York Co Has 0.87 Sentiment https://t.co/wNVgp0jUM6,Thu Jul 12 12:02:16 +0000 2018,The_CasualSmart,KIM,Kimco Realty Corporation,https://cardinalweekly.com/pggm-investments-decreased-kimco-rlty-com-kim-holding-bank-of-new-york-co-has-0-87-sentiment/,False\n1017378724193321000,Jacobs Secures Key Role on Etihad Rail Project in UAE $JEC - https://t.co/scmUpVGcOW,Thu Jul 12 12:02:44 +0000 2018,MarketGlance,JEC,Jacobs Engineering Group Inc.,https://prn.to/2zy9FZN,False\n1017378797799297000,EPS for MGM Resorts International $MGM Expected At $0.27; Cummins $CMI's Sentiment Is 0.9 https://t.co/qweplfusNL,Thu Jul 12 12:03:02 +0000 2018,The_CasualSmart,CMI,Cummins Inc.,https://cardinalweekly.com/eps-for-mgm-resorts-international-mgm-expected-at-0-27-cummins-cmis-sentiment-is-0-9/,False\n1017378812001210400,Bucking Trump Health Insurers Expand Obamacare Footprints via @forbes https://t.co/4L6MYrYBhV $CNC $ANTM #MorningJoe,Thu Jul 12 12:03:05 +0000 2018,brucejapsen,CNC,Centene Corporation,http://www.forbes.com/sites/brucejapsen/2018/07/12/bucking-trump-health-insurers-expand-obamacare-footprints/#4cbef1855fc0,False\n1017378817965510700,Thornburg Investment Management INC Decreased Its Cme Group INC $CME Position as Stock Value Declined https://t.co/ZIWHGHJcVw,Thu Jul 12 12:03:06 +0000 2018,The_CasualSmart,CME,CME Group 
Inc.,https://thecasualsmart.com/2018/07/12/thornburg-investment-management-inc-decreased-its-cme-group-inc-cme-position-as-stock-value-declined/,False\n1017378830472884200,Analysts Anticipate Mid-America Apartment Communities Inc $MAA to Announce $1.48 Earnings Per Share https://t.co/Yn9MvpjRaY,Thu Jul 12 12:03:09 +0000 2018,EnterpriseLeade,MAA,Mid-America Apartment Communities,http://theenterpriseleader.com/?p=981655,False\n1017379091945869300,Thornburg Investment Management INC Cut Cme Group INC $CME Holding as Stock Declined https://t.co/FkYNlkGs7Q,Thu Jul 12 12:04:12 +0000 2018,The_CasualSmart,CME,CME Group Inc.,https://floridarecorder.com/2018/07/12/thornburg-investment-management-inc-cut-cme-group-inc-cme-holding-as-stock-declined/,False\n1017379188691619800,RT @robles_jdaniel: @TrainWithBain @ksenapathy @MonsantoCo Indeed. Trial of the century. #RoundupTrial #DeWayneJohnson #GlyphosateGate #Wo…,Thu Jul 12 12:04:35 +0000 2018,JeffK_BanGMOs,MON,Monsanto Company,,False\n1017379200158838800,Westwood Management Has Lifted Laboratory Of America $LH Position; Shorts at Shineco $TYHT Raised By 8.28% https://t.co/BfrJTXLtTd,Thu Jul 12 12:04:37 +0000 2018,beyond_ninety,LH,Laboratory Corporation of America Holdings,https://tokoyuri.com/2018/07/12/westwood-management-has-lifted-laboratory-of-america-lh-position-shorts-at-shineco-tyht-raised-by-8-28/,False\n1017379378362290200,RT @brucejapsen: Bucking Trump Health Insurers Expand Obamacare Footprints via @forbes https://t.co/4L6MYrYBhV $CNC $ANTM #MorningJoe,Thu Jul 12 12:05:20 +0000 2018,SusanS96965737,CNC,Centene Corporation,http://www.forbes.com/sites/brucejapsen/2018/07/12/bucking-trump-health-insurers-expand-obamacare-footprints/#4cbef1855fc0,False\n1017379740330709000,As Travelers Companies $TRV Stock Declined Shareholder First Wilshire Securities Management Trimmed by $301254 It… https://t.co/gLzFo8Gf97,Thu Jul 12 12:06:46 +0000 2018,usindexlive,EL,The EstíëŒ©e Lauder Companies 
Inc.,https://twitter.com/i/web/status/1017379740330708992,False\n1017379820056006700,Extreme Networks $EXTR Stake Held by Cramer Rosenthal Mcglynn Llc; Fny Managed Accounts Has Increased Its Monsanto… https://t.co/VbSC5mBUfW,Thu Jul 12 12:07:05 +0000 2018,The_CasualSmart,MON,Monsanto Company,https://twitter.com/i/web/status/1017379820056006656,False\n1017380010674540500,Bp Plc Upped Humana $HUM Position; UBM PLC ORDINARY SHARES $UBMOF Shorts Up By 2.5% https://t.co/gbYn1or7cX,Thu Jul 12 12:07:51 +0000 2018,The_CasualSmart,HUM,Humana Inc.,https://cardinalweekly.com/bp-plc-upped-humana-hum-position-ubm-plc-ordinary-shares-ubmof-shorts-up-by-2-5/,False\n1017380126756139000,FY2019 EPS Estimates for DISCOVERY COMMUNICATIONS INC. Common Stock Cut by Analyst $DISCA https://t.co/791WdGkmyz,Thu Jul 12 12:08:18 +0000 2018,TheMarketsDaily,DISCA,Discovery,http://zpr.io/6aLTf,False\n1017380212940660700,Midas Management Position in Amerisourcebergen Corp. $ABC Lowered by $666500 as Shares Declined; Foyston Gordon &amp;… https://t.co/BUvzZ0OqnI,Thu Jul 12 12:08:39 +0000 2018,The_CasualSmart,ABC,AmerisourceBergen Corporation,https://twitter.com/i/web/status/1017380212940660736,False\n1017380226509176800,Laboratory Amer Hldgs $LH Stock Rose While Stone Run Capital Upped Holding by $338100; Townebank Portsmouth Va… https://t.co/PwjEyEQSpR,Thu Jul 12 12:08:42 +0000 2018,usindexlive,LH,Laboratory Corporation of America Holdings,https://twitter.com/i/web/status/1017380226509176837,False\n1017380242804133900,As Emerson Elec Co $EMR Market Valuation Declined Stone Run Capital Lowered Its Position; As American Elec Pwr… https://t.co/syChSnTQET,Thu Jul 12 12:08:46 +0000 2018,The_CasualSmart,AEP,American Electric Power Company,https://twitter.com/i/web/status/1017380242804133889,False\n1017380281974652900,RT @robles_jdaniel: @TrainWithBain @ksenapathy @MonsantoCo Indeed. Trial of the century. 
#RoundupTrial #DeWayneJohnson #GlyphosateGate #Wo…,Thu Jul 12 12:08:55 +0000 2018,Resourceful1942,MON,Monsanto Company,,False\n1017380361523859500,$CY Cypress to Announce Second Quarter 2018 Results From our Stock News Alerts App,Thu Jul 12 12:09:14 +0000 2018,Allstocknews,CY,Cypress Semiconductor Corporation,,False\n1017380418411167700,OceanFirst Financial Corp. $OCFC EPS Estimated At $0.46; Shorts at Udr $UDR Lowered By 4.92% https://t.co/7A7g922ay3,Thu Jul 12 12:09:28 +0000 2018,The_CasualSmart,UDR,UDR,https://cardinalweekly.com/oceanfirst-financial-corp-ocfc-eps-estimated-at-0-46-shorts-at-udr-udr-lowered-by-4-92/,False\n1017380620358611000,Speece Thorson Capital Group Upped O Reilly Automotive $ORLY Position By $1.42 Million; American Electric Power Co… https://t.co/TvNOHEcfL2,Thu Jul 12 12:10:16 +0000 2018,The_CasualSmart,AEP,American Electric Power Company,https://twitter.com/i/web/status/1017380620358610945,False\n1017380662461026300,Dr Pepper Snapple Group $DPS Shareholder Raymond James &amp; Associates Has Cut Stake as Valuation Rose; Montag &amp; Caldw… https://t.co/kmKlOBkrmD,Thu Jul 12 12:10:26 +0000 2018,The_CasualSmart,DPS,Dr Pepper Snapple Group,https://twitter.com/i/web/status/1017380662461026305,False\n1017380681717026800,Nextera Energy $NEE Holder Shelter Mutual Insurance Co Has Lifted Stake by $668300 as Market Value Rose; Rafferty… https://t.co/kfvRUauoYy,Thu Jul 12 12:10:31 +0000 2018,The_CasualSmart,NEE,NextEra Energy,https://twitter.com/i/web/status/1017380681717026817,False\n1017380841863905300,As The Kroger Co. 
$KR Stock Declined Coho Partners LTD Has Increased Stake; Greenbrier Partners Capital Management… https://t.co/ffJxp9DNuK,Thu Jul 12 12:11:09 +0000 2018,The_CasualSmart,PHM,PulteGroup,https://twitter.com/i/web/status/1017380841863905281,False\n1017380974374674400,New Vernon Capital Holdings Ii Maintains Holding in Hdfc Bank LTD $HDB; Meeder Asset Management Maintains Position… https://t.co/vIKPXldZ34,Thu Jul 12 12:11:40 +0000 2018,The_CasualSmart,CNP,CenterPoint Energy,https://twitter.com/i/web/status/1017380974374674432,False\n1017381050069209100,Pinnacle Foods Inc. $PF Analysts See $0.56 EPS; Last Week Envision Healthcare $EVHC Analysts https://t.co/ELE2lG4pE4,Thu Jul 12 12:11:59 +0000 2018,The_CasualSmart,EVHC,Envision Healthcare Corporation,https://cardinalweekly.com/pinnacle-foods-inc-pf-analysts-see-0-56-eps-last-week-envision-healthcare-evhc-analysts/,False\n1017381131229032400,Hormel Foods Corp Plans Quarterly Dividend of $0.19 $HRL https://t.co/unWDPziTvM,Thu Jul 12 12:12:18 +0000 2018,TickerReport,HRL,Hormel Foods Corporation,http://tickerreport.com/?p=3631159,False\n1017381180491161600,Etrade Capital Management Boosted Its Las Vegas Sands $LVS Stake by $317441; Valuation Rose; As Xilinx $XLNX Share… https://t.co/8GxsOOqNfA,Thu Jul 12 12:12:30 +0000 2018,The_CasualSmart,XLNX,Xilinx,https://twitter.com/i/web/status/1017381180491161600,False\n1017381341875318800,Now that the stage is set for catastrophic collapse of the american economy led by @stevenmnuchin1's megalith at… https://t.co/u88jImVoIi,Thu Jul 12 12:13:08 +0000 2018,RestonTV,BXP,Boston Properties,https://twitter.com/i/web/status/1017381341875318784,False\n1017381407902052400,Viacom Inc. $VIAB EPS Estimated At $1.06 https://t.co/xZL2vG0sJQ,Thu Jul 12 12:13:24 +0000 2018,The_CasualSmart,VIAB,Viacom,https://thecasualsmart.com/2018/07/12/viacom-inc-viab-eps-estimated-at-1-06-2/,False\n1017381514005409800,Analysts See $1.06 EPS for Viacom Inc. 
$VIAB - https://t.co/8ohIqmmbqc,Thu Jul 12 12:13:49 +0000 2018,whatsonthorold2,VIAB,Viacom,https://www.whatsonthorold.com/2018/07/12/analysts-see-1-06-eps-for-viacom-inc-viab/,False\n1017381532980400100,RT @CryptoNormand: My Ranking top 10 - moon end of years 2018 1- @s_protocol 2- @PolicyPalNET 3- @TE_FOOD 4- @APEXnetworkCPX 5- @TomoCha…,Thu Jul 12 12:13:54 +0000 2018,CryptoNormand,EFX,Equifax Inc.,,False\n1017381597060968400,Viacom Inc. $VIAB EPS Estimated At $1.06 - https://t.co/qmynZx7PRk,Thu Jul 12 12:14:09 +0000 2018,mmahotstuff1,VIAB,Viacom,https://www.mmahotstuff.com/2018/07/12/viacom-inc-viab-eps-estimated-at-1-06.html,False\n1017381658771841000,EPS for Viacom Inc. $VIAB Expected At $1.06 https://t.co/Be0781LGr3,Thu Jul 12 12:14:24 +0000 2018,The_CasualSmart,VIAB,Viacom,https://floridarecorder.com/2018/07/12/eps-for-viacom-inc-viab-expected-at-1-06/,False\n1017381673846083600,$DATA $ARE $ONE $EXAM $ALL $MEN $AGO $YUM! $I $AM $TOO $HOT $BWA!,Thu Jul 12 12:14:27 +0000 2018,stockbard,BWA,BorgWarner Inc.,,False\n1017381681983041500,Analysts See $1.70 EPS for Praxair Inc. $PX; Marathon Petroleum $MPC Has 0.9 Sentiment https://t.co/DN5TO4bMlj,Thu Jul 12 12:14:29 +0000 2018,The_CasualSmart,MPC,Marathon Petroleum Corporation,https://cardinalweekly.com/analysts-see-1-70-eps-for-praxair-inc-px-marathon-petroleum-mpc-has-0-9-sentiment/,False\n1017381695979262000,$CVIA $EMES $HCLP $SLCA $SND $WTTR https://t.co/muoW2uKsvg,Thu Jul 12 12:14:33 +0000 2018,SeekingAlpha,SLCA,U.S. Silica Holdings,https://seekingalpha.com/article/4186846-select-energy-q2-shaping-blowout?source=feed_f,False\n1017381774484271100,Viacom Inc. 
$VIAB Analysts See $1.06 EPS https://t.co/vikbcoZQ0x,Thu Jul 12 12:14:51 +0000 2018,reurope_stock,VIAB,Viacom,https://reurope.com/2018/07/12/viacom-inc-viab-analysts-see-1-06-eps/,False\n1017381840540307500,Grace &amp; White Has Increased Kimco Rlty Com $KIM Holding By $2.96 Million; Ulta Beauty $ULTA Shorts Decreased By 6.8… https://t.co/9sRvUi0l0s,Thu Jul 12 12:15:07 +0000 2018,The_CasualSmart,KIM,Kimco Realty Corporation,https://twitter.com/i/web/status/1017381840540307458,False\n1017381846806655000,Viacom Inc. $VIAB Analysts See $1.06 EPS - https://t.co/Zc8xkYWFgU,Thu Jul 12 12:15:08 +0000 2018,bibeypost_stock,VIAB,Viacom,https://www.bibeypost.com/viacom-inc-viab-analysts-see-1-06-eps-2/,False\n1017381861323046900,Gilead Sciences $Call $GILD Shareholder Continental Advisors Lowered Its Holding by $1.28 Million as Valuation Decl… https://t.co/tg4uCU6bYJ,Thu Jul 12 12:15:12 +0000 2018,The_CasualSmart,FRT,Federal Realty Investment Trust,https://twitter.com/i/web/status/1017381861323046915,False\n1017382024578064400,RT @brucejapsen: Bucking Trump Health Insurers Expand Obamacare Footprints via @forbes https://t.co/4L6MYrYBhV $CNC $ANTM #MorningJoe,Thu Jul 12 12:15:51 +0000 2018,ADCBenefits,CNC,Centene Corporation,http://www.forbes.com/sites/brucejapsen/2018/07/12/bucking-trump-health-insurers-expand-obamacare-footprints/#4cbef1855fc0,False\n1017382223639695400,Rutabaga Capital Management Has Cut Par Pacific Holdings $PARR Holding; Lyondellbasell Industries NV $LYB Has 0.89… https://t.co/qwb5mezpZK,Thu Jul 12 12:16:38 +0000 2018,The_CasualSmart,LYB,LyondellBasell Industries N.V.,https://twitter.com/i/web/status/1017382223639695360,False\n1017382226881843200,As Medtronic Plc. 
$MDT Share Value Rose Shareholder J-P Marvel Investment Advisors Decreased Its Position; First C… https://t.co/aeA9HN7e6H,Thu Jul 12 12:16:39 +0000 2018,The_CasualSmart,CAH,Cardinal Health,https://twitter.com/i/web/status/1017382226881843200,False\n1017382250298691600,Lumentum $LITE Downgraded by ValuEngine https://t.co/xTcQNM6OrF,Thu Jul 12 12:16:45 +0000 2018,ZolmaxNews,LITE,Lumentum Holdings Inc.,http://zolmax.com/?p=2377426,False\n1017382273388294100,Lyondellbasell Industries Nv $LYB Holder Grimes &amp; Company INC Has Cut Its Stake by $8.26 Million https://t.co/UFgLiGK5Z8,Thu Jul 12 12:16:50 +0000 2018,The_CasualSmart,LYB,LyondellBasell Industries N.V.,https://thecasualsmart.com/2018/07/12/lyondellbasell-industries-nv-lyb-holder-grimes-company-inc-has-cut-its-stake-by-8-26-million/,False\n1017382323648512000,Analysts Anticipate $QQEW Will Reach $70 $DISH $KLAC $XRAY #etfs https://t.co/Cvv9c1X494,Thu Jul 12 12:17:02 +0000 2018,bnkinvest,XRAY,DENTSPLY SIRONA Inc.,http://dlvr.it/QbJk2p,False\n1017382326727086100,Implied $VLUE Analyst Target Price: $98 $WDC $XRX $LEN #etfs https://t.co/6HUDJyqkSg,Thu Jul 12 12:17:03 +0000 2018,bnkinvest,XRX,Xerox Corporation,http://dlvr.it/QbJk2s,False\n1017382332905353200,Analysts Anticipate $IWD Will Reach $139 $PWR $AA $NUAN #etfs https://t.co/U74ZWKE9jJ,Thu Jul 12 12:17:04 +0000 2018,bnkinvest,NUAN,Nuance Communications,http://dlvr.it/QbJk31,False\n1017382334390128600,Analysts Forecast 11% Upside For The Holdings of $DGRW $ORA $MCHP $JBL #etfs https://t.co/TkgCERt9te,Thu Jul 12 12:17:05 +0000 2018,bnkinvest,MCHP,Microchip Technology Incorporated,http://dlvr.it/QbJk34,False\n1017382336143372300,Analysts Anticipate 11% Upside For The Holdings of $PYZ $PAH $FMC $SXT #etfs https://t.co/G7ATieTJn2,Thu Jul 12 12:17:05 +0000 2018,bnkinvest,PAH,Platform Specialty Products Corporation,http://dlvr.it/QbJk39,False\n1017382397120254000,Grimes &amp; Company INC Has Lowered Its Lyondellbasell Industries Nv $LYB Stake by $8.26 Million; Stock 
Rose - https://t.co/KrUe8L0K7r,Thu Jul 12 12:17:20 +0000 2018,whatsonthorold2,LYB,LyondellBasell Industries N.V.,https://www.whatsonthorold.com/2018/07/12/grimes-stock-rose/,False\n1017382510240596000,Grimes &amp; Company INC Stake in Lyondellbasell Industries Nv $LYB Decreased by $8.26 Million - https://t.co/UaY92nLNHb,Thu Jul 12 12:17:47 +0000 2018,mmahotstuff1,LYB,LyondellBasell Industries N.V.,https://www.mmahotstuff.com/2018/07/12/grimes-company-inc-stake-in-lyondellbasell-industries-nv-lyb-decreased-by-8-26-million.html,False\n1017382556411617300,Lyondellbasell Industries Nv $LYB Shareholder Grimes &amp; Company INC Decreased Its Stake as Shares Rose https://t.co/ChD8n7MJnU,Thu Jul 12 12:17:58 +0000 2018,The_CasualSmart,LYB,LyondellBasell Industries N.V.,https://floridarecorder.com/2018/07/12/lyondellbasell-industries-nv-lyb-shareholder-grimes-company-inc-decreased-its-stake-as-shares-rose/,False\n1017382653950136300,Lyondellbasell Industries Nv $LYB Holder Grimes &amp; Company INC Has Cut Its Stake by $8.26 Million as Shares Rose https://t.co/W8OndtJysn,Thu Jul 12 12:18:21 +0000 2018,reurope_stock,LYB,LyondellBasell Industries N.V.,https://reurope.com/2018/07/12/lyondellbasell-industries-nv-lyb-holder-grimes-company-inc-has-cut-its-stake-by-8-26-million-as-shares-rose/,False\n1017382659633369100,Hibbett Sports $HIBB Upgraded to “Hold” at ValuEngine https://t.co/WiYpZziPpZ,Thu Jul 12 12:18:22 +0000 2018,WeekHerald,HIBB,Hibbett Sports,http://weekherald.com/?p=2985529,False\n1017382672820260900,Sumitomo Life Insurance Company Decreased Helmerich &amp; Payne Com $HP Position By $625548; Consolidated Edison Has 0… https://t.co/X4P4FM5Dl6,Thu Jul 12 12:18:25 +0000 2018,The_CasualSmart,HP,Helmerich & Payne,https://twitter.com/i/web/status/1017382672820260864,False\n1017382707184197600,Mycio Wealth Partners Lifted Apple $AAPL Holding; Simon Property Group $SPG Has 0.93 Sentiment https://t.co/qPEYVYHGeY,Thu Jul 12 12:18:34 +0000 2018,The_CasualSmart,SPG,Simon 
Property Group,https://cardinalweekly.com/mycio-wealth-partners-lifted-apple-aapl-holding-simon-property-group-spg-has-0-93-sentiment/,False\n1017382714448732200,Cvs Health $CVS Share Price Declined While Trust Co Of Oklahoma Decreased Holding; Cwm Cut Its Stake in Price T Row… https://t.co/cjznOC5ELT,Thu Jul 12 12:18:35 +0000 2018,The_CasualSmart,TROW,T. Rowe Price Group,https://twitter.com/i/web/status/1017382714448732160,False\n1017382725534306300,Grimes &amp; Company INC Has Decreased Lyondellbasell Industries Nv $LYB Stake by $8.26 Million - https://t.co/JvkglGXXkw,Thu Jul 12 12:18:38 +0000 2018,bibeypost_stock,LYB,LyondellBasell Industries N.V.,https://www.bibeypost.com/grimes-company-inc-has-decreased-lyondellbasell-industries-nv-lyb-stake-by-8-26-million/,False\n1017382775303925800,Cullen Frost Bankers Has Increased Stake in Edwards Lifesciences Com $EW by $5.98 Million; Comcast New $CMCSA Holde… https://t.co/A04nXFy6bO,Thu Jul 12 12:18:50 +0000 2018,The_CasualSmart,EW,Edwards Lifesciences Corporation,https://twitter.com/i/web/status/1017382775303925761,False\n1017382903691563000,Prochemie Gmbh $LYB Shareholder Kepos Capital LP Has Lifted Its Holding by $778365; General Electric Co Com $GE Ho… https://t.co/alYdoBtLFt,Thu Jul 12 12:19:20 +0000 2018,The_CasualSmart,LYB,LyondellBasell Industries N.V.,https://twitter.com/i/web/status/1017382903691563008,False\n1017382907038654500,Discovery Inc Series C $DISCK Upgraded by ValuEngine to “Hold” https://t.co/urybbDSRMe,Thu Jul 12 12:19:21 +0000 2018,dakotafinancial,DISCK,Discovery,http://dakotafinancialnews.com/?p=313250,False\n1017382992443043800,7/12 200D MA Watch List: $AVEO $KEY $DUST $TSLA $AMTD $AUY $LVS $SPWR $UTX $KO $EWW $ENB $AA $RIO $FAS $GRPN $COF… https://t.co/bZH4Hl75Q9,Thu Jul 12 12:19:42 +0000 2018,TradeAcademyCo,AABA,Altaba Inc.,https://twitter.com/i/web/status/1017382992443043841,False\n1017383129466724400,7/12 50D MA Watch List: $THO $FEYE $OKTA $GE $ECA $MOS $SWKS $THC $DKS $QCOM $STX $WBA $XLE 
$FAS $BBY $TXN $DOCU… https://t.co/3kjTVAUdaD,Thu Jul 12 12:20:14 +0000 2018,TradeAcademyCo,MOS,The Mosaic Company,https://twitter.com/i/web/status/1017383129466724353,False\n1017383257200001000,RT @robles_jdaniel: @TrainWithBain @ksenapathy @MonsantoCo Indeed. Trial of the century. #RoundupTrial #DeWayneJohnson #GlyphosateGate #Wo…,Thu Jul 12 12:20:45 +0000 2018,Kenn_QBE,MON,Monsanto Company,,False\n1017383364205170700,7/12 High IV Watch List: $SNAP $GT $UA $FOSL $W $AKRX $GPRO $CAR $UAA $BIIB $WTW $TTWO $GRUB $XRX $FTR $MNST $SKX… https://t.co/PvANd2HaJm,Thu Jul 12 12:21:10 +0000 2018,TradeAcademyCo,XRX,Xerox Corporation,https://twitter.com/i/web/status/1017383364205170688,False\n1017383453434761200,7/12 Low IV Watch List: $PTLA $DVMT $VMW $BBY $GBT $WMT $NTAP $HRB $FL $LOW $TXMD $CSCO $MDT $WPM $HD $URBN $EWW… https://t.co/R1mtLsVF2T,Thu Jul 12 12:21:32 +0000 2018,TradeAcademyCo,DLTR,Dollar Tree,https://twitter.com/i/web/status/1017383453434761217,False\n1017383505226018800,Morning Most Tweeted Big Caps check out Gambiste Top 10: $CA $IQ $AMD $FAST $NDAQ $MCHP $AAL $MNST $COST $QQQ https://t.co/lYp5LtitVR,Thu Jul 12 12:21:44 +0000 2018,GambisteFinance,MCHP,Microchip Technology Incorporated,http://gambiste.com/index.php/current-month-stock-data/,False\n1017383660633411600,Beneteau SA: Historical volume for Bénéteau stock. 
$BEN https://t.co/0BwZcbcSal https://t.co/AKojYv5uEF,Thu Jul 12 12:22:21 +0000 2018,MScreener,BEN,Franklin Resources,http://www.4-traders.com//BENETEAU-SA-4622/,False\n1017383707647397900,Bnp Paribas Investment Partners Sa Has Cut Its Stake in Simon Ppty Group INC New $SPG by $25.07 Million https://t.co/N0nMnA6B0D,Thu Jul 12 12:22:32 +0000 2018,The_CasualSmart,SPG,Simon Property Group,https://thecasualsmart.com/2018/07/12/bnp-paribas-investment-partners-sa-has-cut-its-stake-in-simon-ppty-group-inc-new-spg-by-25-07-million/,False\n1017383864745029600,Simon Ppty Group INC New $SPG Stock Rose While Bnp Paribas Investment Partners Sa Has Trimmed Stake by $25.07 Milli… https://t.co/H9lu9vKiEy,Thu Jul 12 12:23:10 +0000 2018,whatsonthorold2,SPG,Simon Property Group,https://twitter.com/i/web/status/1017383864745029632,False\n1017384030541701100,Bnp Paribas Investment Partners Sa Cut Its Simon Ppty Group INC New $SPG Position as Stock Value Rose - https://t.co/s2lFuObZqd,Thu Jul 12 12:23:49 +0000 2018,mmahotstuff1,SPG,Simon Property Group,https://www.mmahotstuff.com/2018/07/12/bnp-paribas-investment-partners-sa-cut-its-simon-ppty-group-inc-new-spg-position-as-stock-value-rose.html,False\n1017384080646791200,Simon Ppty Group INC New $SPG Holder Bnp Paribas Investment Partners Sa Has Decreased Its Holding as Share Price Ro… https://t.co/R2MTXgOpU2,Thu Jul 12 12:24:01 +0000 2018,The_CasualSmart,SPG,Simon Property Group,https://twitter.com/i/web/status/1017384080646791168,False\n1017384263417761800,Bnp Paribas Investment Partners Sa Has Trimmed Its Simon Ppty Group INC New $SPG Stake by $25.07 Million as Market Valuation Rose ...,Thu Jul 12 12:24:45 +0000 2018,reurope_stock,SPG,Simon Property Group,,False\n1017384359823888400,As Simon Ppty Group INC New $SPG Market Valuation Rose Holder Bnp Paribas Investment Partners Sa Has Trimmed Its H… https://t.co/WkymyFhQSG,Thu Jul 12 12:25:08 +0000 2018,bibeypost_stock,SPG,Simon Property 
Group,https://twitter.com/i/web/status/1017384359823888390,False\n1017385141235322900,RT @proactive_UK: $ABC Abcam forecasts double digit revenue growth for full-year as it sees product growth across the board https://t.co/1a…,Thu Jul 12 12:28:14 +0000 2018,handskitech,ABC,AmerisourceBergen Corporation,,False\n1017385170943598600,$AAP pt raised to $154 from $133 at RBC Capital analyst Scot Ciccarelli kept his Outperform rating. deep dive analy… https://t.co/TWytNguVgs,Thu Jul 12 12:28:21 +0000 2018,psk2329,AAP,Advance Auto Parts,https://twitter.com/i/web/status/1017385170943598592,False\n1017385626566524900,SunCoke Energy Partners L.P. $SXCP EPS Estimated At $0.32; Hershey Co $HSY Has 0.8 Sentiment https://t.co/T9YpMB1076,Thu Jul 12 12:30:10 +0000 2018,The_CasualSmart,HSY,The Hershey Company,https://cardinalweekly.com/suncoke-energy-partners-l-p-sxcp-eps-estimated-at-0-32-hershey-co-hsy-has-0-8-sentiment/,False\n1017386221033537500,$HUM technical alerts: Stochastic Sell Signal New 52 Week High Expansion Breakout Slingshot Bul... 
https://t.co/vt3vAu3Yc4,Thu Jul 12 12:32:31 +0000 2018,SwingTradeBot,HUM,Humana Inc.,https://swingtradebot.com/equities/HUM,False\n1017386371219116000,3 Strong Buy Semiconductor Stocks to Consider $MLNX $INTC $MCHP Also $AAPL $MSFT $GOOGL https://t.co/74UwdyWifO,Thu Jul 12 12:33:07 +0000 2018,weijgenberger,MCHP,Microchip Technology Incorporated,https://goo.gl/qhbrwx,False\n1017386382988398600,Noble Energy $NBL Research Coverage Started at Morgan Stanley https://t.co/WXi62KRA5G,Thu Jul 12 12:33:10 +0000 2018,ledgerzette,NBL,Noble Energy,http://ledgergazette.com/?p=2821590,False\n1017386629823123500,The system being tested in Australia uses cameras to detect if a driver is holding their phone."" $MSI https://t.co/yS0PTR7zNr",2018-07-12 12:34:09,WaltBTIG,MSI,Motorola Solutions,https://twitter.com/cultofmac/status/1017386164909756416,True,7,12,2018,110.690796,111.104782,111.104782,0.104429,0.517211,"[novice, trader, trade, 5, 10, time, big, taking, 5, 10, percent, risk, trade, taking…, https://t.co/ya7IPSVtBT,Thu, jul, 12, 2018, traderead, mchp, microchip, technology, incorporated, https://twitter.com/i/web/status/1017378035509669890,False, 1017378044212924400, short, sale, volume, short, interest, $CGNX, 55, https://t.co/jmH5MzTzhq, $HON, 34, $ROP, 47, $XLK, 74, $NOVT, 51, thu, jul, 12, 2018, shortvolumes, rop, roper, technology, http://shortvolumes.com/?t=CGNX,False, 1017378075947028500, bidaskclub, top, 50, stronglybought, $EIX, $SODA, $GDS, $TPL, $AMGN, $NOW, $CCI, $DLR, $WELL, $LLY, $CRM, $INFY, $K, $SNE, $SO, https://t.co/e4YOIDYYL1,Thu, jul, 12, 2018, bactop50cies, ip, international, paper, company, https://twitter.com/i/web/status/1017378075947028485,False, 1017378386589683700, downgrade, $AFG, $AVGO, $CJ, $CNFR, $BK, $MMM, $EGN, $ITW, $SPN, $SLCA, $CMCSA, $PH, $SM, $EXR, $FUN, $GLNG, thu, jul, ...]"


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# NOTE(review): every transformer in this cell is fitted on the full dataframe.
# If the rows are split into train/test sets afterwards, statistics from the
# test rows leak into the features -- consider fitting on the training rows only.

# TF-IDF vectorization for the 'preprocessed_tweet' column, keeping at most
# 1000 vocabulary terms
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_features = tfidf_vectorizer.fit_transform(df['preprocessed_tweet'].astype('U'))  # Convert to Unicode

# One-hot encoding for categorical variables.
# Each column gets its own encoder: re-fitting a single shared instance would
# overwrite the categories learned for 'source', making that encoding
# impossible to reproduce on new data.
source_encoder = OneHotEncoder()
onehot_features_source = source_encoder.fit_transform(df[['source']])
symbols_encoder = OneHotEncoder()
onehot_features_symbols = symbols_encoder.fit_transform(df[['symbols']])

# Backward-compatible alias: this name previously ended up fitted on 'symbols'
onehot_encoder = symbols_encoder

# Scaling numerical features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['tweet_polarity', 'tweet_subjectivity']])

## Testing/Training Data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.sparse import hstack

# Stack every feature group side by side into one design matrix
feature_blocks = [tfidf_features, onehot_features_source, onehot_features_symbols, scaled_features]
X = hstack(feature_blocks)

# Regression target: the stock price on the day after the tweet
y = df['Price Day After Tweet'].values

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the four regressors that get compared.
# The alpha of Ridge/Lasso is a tuning knob; other values may be worth trying.
linear_reg_model = LinearRegression()
random_forest_model = RandomForestRegressor(random_state=42)
ridge_model = Ridge(alpha=1.0)
lasso_model = Lasso(alpha=1.0)

# Fit each regressor on the same training split
for estimator in (linear_reg_model, random_forest_model, ridge_model, lasso_model):
    estimator.fit(X_train, y_train)

In [None]:
def _evaluate_regressor(model, features, targets):
    """Return (predictions, MAE, MSE, RMSE) for a fitted model on the given data.

    RMSE is derived as the square root of the MSE instead of calling
    mean_squared_error(..., squared=False): that keyword is deprecated in
    scikit-learn 1.4 and removed in 1.6, so relying on it breaks on current
    releases.
    """
    predictions = model.predict(features)
    mae = mean_absolute_error(targets, predictions)
    mse = mean_squared_error(targets, predictions)
    rmse = mse ** 0.5
    return predictions, mae, mse, rmse


# Evaluate linear regression model
linear_reg_pred, linear_reg_mae, linear_reg_mse, linear_reg_rmse = _evaluate_regressor(
    linear_reg_model, X_test, y_test
)

# Evaluate random forest regression model
random_forest_pred, random_forest_mae, random_forest_mse, random_forest_rmse = _evaluate_regressor(
    random_forest_model, X_test, y_test
)

# Evaluate ridge regression model
ridge_pred, ridge_mae, ridge_mse, ridge_rmse = _evaluate_regressor(
    ridge_model, X_test, y_test
)

# Evaluate lasso regression model
lasso_pred, lasso_mae, lasso_mse, lasso_rmse = _evaluate_regressor(
    lasso_model, X_test, y_test
)

In [None]:
import matplotlib.pyplot as plt

# Model names for the x-axis; every score list below follows this same order
models = ['Linear Regression', 'Random Forest', 'Ridge Regression', 'Lasso Regression']
mae_scores = [linear_reg_mae, random_forest_mae, ridge_mae, lasso_mae]
mse_scores = [linear_reg_mse, random_forest_mse, ridge_mse, lasso_mse]
rmse_scores = [linear_reg_rmse, random_forest_rmse, ridge_rmse, lasso_rmse]

# One bar chart per metric: (chart title, y-axis label, scores, bar colour)
chart_specs = (
    ('Mean Absolute Error (MAE) Comparison', 'MAE', mae_scores, 'skyblue'),
    ('Mean Squared Error (MSE) Comparison', 'MSE', mse_scores, 'salmon'),
    ('Root Mean Squared Error (RMSE) Comparison', 'RMSE', rmse_scores, 'lightgreen'),
)

# Draw the charts in order: MAE, then MSE, then RMSE
for chart_title, metric_label, metric_scores, bar_color in chart_specs:
    plt.figure(figsize=(10, 5))
    plt.bar(models, metric_scores, color=bar_color)
    plt.title(chart_title)
    plt.xlabel('Regression Model')
    plt.ylabel(metric_label)
    plt.xticks(rotation=45)
    plt.show()