## Import Packages and Data

In [1]:
# Standard
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings; warnings.simplefilter('ignore')
import re

# Machine Learning
from scipy import stats
import sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, precision_score, precision_recall_curve, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# NLP 
import string
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import RegexpTokenizer
import nltk
# nltk.download('wordnet')
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# Load the raw earnings-call transcripts into df1.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook will not
# run on another machine until it points at a local copy of earning_calls.csv.
earnings_calls_data = r"C:\Users\AnEnt\Documents\MSBA\BANA 275 - NLP\Project\earning_calls.csv"
df1 = pd.read_csv(earnings_calls_data)

In [3]:
# Add a 1-based row key. It stays a regular column (not the DataFrame index) because the
# variable-extraction cells below merge their results back with on='Index'.
# The previous `df1.set_index('Index')` call discarded its return value and therefore
# did nothing, so it has been removed.
df1['Index'] = range(1, len(df1) + 1)
df1.head()

Unnamed: 0,href,Transcript,Ticker,Index
0,/earnings/call-transcripts/2020/04/30/apple-in...,"[""OperatorGood day, everyone. Welcome to the A...",aapl,1
1,/earnings/call-transcripts/2020/01/28/apple-in...,"['Contents:', '', 'Prepared Remarks', 'Questio...",aapl,2
2,/earnings/call-transcripts/2019/10/30/apple-in...,"[""OperatorGood day, everyone. Welcome to the A...",aapl,3
3,/earnings/call-transcripts/2019/07/30/apple-in...,"['Contents:', '', 'Prepared Remarks', 'Questio...",aapl,4
4,/earnings/call-transcripts/2019/04/30/apple-in...,"['Prepared Remarks:', 'Operator', ""Good day, a...",aapl,5


In [4]:
# Load the daily adjusted-close prices into df2.
# NOTE(review): absolute, machine-specific Windows path; adjust before running elsewhere.
stock_data = r"C:\Users\AnEnt\Documents\MSBA\BANA 275 - NLP\Project\Company_Stock_Price_Clean.csv"
df2 = pd.read_csv(stock_data)
df2.head()

Unnamed: 0,Date,Ticker,Adj_Close
0,1/3/17,MSFT,58.969059
1,1/3/17,AAPL,110.392334
2,1/3/17,AMZN,753.669983
3,1/3/17,FB,116.860001
4,1/3/17,GOOGL,808.01001


## Extract Variables

### Date
- Year
- Month
- Day
- Date

In [5]:
# Split href on "/" (at most 6 splits -> columns 0..6).
# For '/earnings/call-transcripts/2020/04/30/<slug>' the parts are:
#   0: ''  1: 'earnings'  2: 'call-transcripts'  3: year  4: month  5: day  6: slug
href = df1['href'].str.split("/", n=6, expand=True)

# Keep only the three date parts and give them the names pd.to_datetime expects
href1 = href.drop([0, 1, 2, 6], axis=1)
href1 = href1.rename(columns={3: "Year", 4: "Month", 5: "Day"})

# Assemble a single datetime column from the Year/Month/Day columns
href1['Date'] = pd.to_datetime(href1[['Year', 'Month', 'Day']])

# 1-based key in the same row order as df1['Index'], used as the merge key.
# (The former `href1.set_index('Index')` / `href1.head()` calls discarded their
# results and showed nothing mid-cell, so they were removed.)
href1['Index'] = range(1, len(href1) + 1)

# Join to Original Dataframe
df1 = df1.merge(href1, how='outer', on='Index')

### Quarter

In [6]:
# Split the slug (href column 6) on "hyphen followed by a digit"; column 0 is everything
# before the first such hyphen.
# presumably the slug looks like '<company>-<ticker>-q<N>-<year>-...' — TODO confirm
href2 = href[6].str.split(r'-\d', n=3, expand=True)

# Split that prefix on '-q'; column 1 is the text after the first '-q' (the quarter number)
href3 = href2[0].str.split('-q', n=4, expand=True)

# Rename and drop unnecessary columns
# NOTE(review): dropping column 2 requires at least one row to contain '-q' twice;
# for such rows column 1 may not be the quarter — verify against the data.
href3 = href3.rename(columns={1: 'Quarter'})
href3 = href3.drop(columns=[0, 2])

# 1-based merge key sized from href3 itself rather than from the unrelated href1 frame.
# (The former `href3.set_index('Index')` / `href3.head()` calls had no effect and were removed.)
href3['Index'] = range(1, len(href3) + 1)

# Merge to existing dataframe
df1 = df1.merge(href3, how='outer', on='Index')

### Company Name

In [7]:
# Split the slug prefix on '-' and use its first two tokens as the company name
# (e.g. 'apple' + ' ' + 'inc'). Companies whose name is not exactly two tokens will be
# truncated or will pick up the following token.
href4 = href2[0].str.split("-", n=6, expand=True)
href4['Company Name'] = href4[0] + " " + href4[1]

# Drop unnecessary columns
href4 = href4.filter(['Company Name'])

# 1-based merge key sized from href4 itself rather than from the unrelated href1 frame.
# (The former `href4.set_index('Index')` call discarded its result and was removed.)
href4['Index'] = range(1, len(href4) + 1)

# Merge to existing dataframe, then discard the helper key now that all merges are done
df1 = df1.merge(href4, how='outer', on='Index')
df1 = df1.drop(columns=['Index'])

## Clean Transcripts

### Clean Contractions

In [8]:
# Contraction dictionary: lowercase contraction -> expanded form.
# Keys are lowercase, so text must be lowercased before expansion (clean() does this).
contractions_dict = {
    "didn't": 'did not',
    "don't": 'do not',
    "aren't": 'are not',
    "can't": 'cannot',
    "could've": "could have",  # was mapped to itself, so it was never expanded
    "couldn't": "could not",
    "i'll": "i will",
    "i'd": "i would",
    "i'm": "i am",
    "it'll": "it will",
    "we'll": "we will",
}


def _compile_contractions(mapping):
    """Return a compiled regex that matches any key of `mapping` literally."""
    # Longest keys first: alternation picks the first alternative that matches, so a
    # longer contraction must be tried before a shorter one that is its prefix.
    keys = sorted(mapping, key=len, reverse=True)
    # re.escape keeps keys containing regex metacharacters from altering the pattern
    return re.compile('(%s)' % '|'.join(re.escape(key) for key in keys))


# Pre-compiled pattern for the default dictionary (kept as a module-level name)
contractions_re = _compile_contractions(contractions_dict)


def expand_contractions(s, contractions_dict=contractions_dict):
    """Replace every contraction found in `s` with its expanded form.

    Parameters
    ----------
    s : str
        Text to process. Matching is case-sensitive and has no word-boundary check.
    contractions_dict : dict, optional
        Mapping of contraction -> expansion. Defaults to the module-level dictionary.

    Returns
    -------
    str
        `s` with every occurrence of a key replaced by its value. Returned unchanged
        when the mapping is empty.
    """
    if not contractions_dict:
        # An empty alternation would match the empty string at every position
        return s

    # Build the pattern from the mapping actually passed in, so every match is guaranteed
    # to be a key of that mapping (the re module caches compiled patterns).
    pattern = _compile_contractions(contractions_dict)

    def replace(match):
        return contractions_dict[match.group(0)]

    return pattern.sub(replace, s)

### Basic Cleaning

In [9]:
# Standard function to clean string
def clean(string):
    """Lowercase `string`, expand known contractions, and replace punctuation with spaces.

    Returns a new string; leading/trailing spaces produced by the substitution are kept.
    """
    expanded = expand_contractions(string.lower())
    # \W+ turns each run of non-word characters into one space
    # (letters, digits and underscores are word characters and are kept)
    return re.sub(r'\W+', ' ', expanded)

In [10]:
# Clean every raw transcript; clean() is passed directly instead of via a wrapper lambda
df1['clean_transcript'] = df1['Transcript'].apply(clean)

### Change Data Types

In [11]:
# Make sure Date is a datetime column before doing date arithmetic on it.
# It was already built with pd.to_datetime in the Date-extraction cell, so this acts as a guard.
df1['Date'] = pd.to_datetime(df1['Date'])

### Create dates 1 week before and after

In [12]:
# Calendar dates exactly 7 days either side of the call date.
# NOTE(review): these are calendar offsets, not trading-day offsets; a date that falls on a
# market holiday will find no price row in the inner merges below — confirm this is intended.
one_week = pd.DateOffset(days = 7)
df1['Date_1Week_Before'] = df1['Date'] - one_week
df1['Date_1Week_After'] = df1['Date'] + one_week

## Clean Stock Data

### Change Data Types

In [13]:
# Tickers: force to string and lowercase so they match the lowercase tickers in df1
df2['Ticker'] = df2['Ticker'].astype(str).str.lower()
# Dates: parse the text dates into datetimes so they can be joined against df1's dates
df2['Date'] = pd.to_datetime(df2['Date'])

In [14]:
# Preview the cleaned stock data: Date should now be parsed and Ticker lowercased
df2.head()

Unnamed: 0,Date,Ticker,Adj_Close
0,2017-01-03,msft,58.969059
1,2017-01-03,aapl,110.392334
2,2017-01-03,amzn,753.669983
3,2017-01-03,fb,116.860001
4,2017-01-03,googl,808.01001


### Merge Datasets

In [15]:
# Attach three prices to every call: on the call date, one week before, one week after.
# All joins are inner, so a call is dropped if any of its three dates has no price row.
stock_keys = ['Date', 'Ticker']

# 1) price on the call date -> adds 'Adj_Close'
test_df = df1.merge(df2, how = 'inner', on = stock_keys)

# 2) price one week before -> pandas suffixes the clashing columns:
#    left Date/Adj_Close become Date_x/Adj_Close_x, right ones Date_y/Adj_Close_y
test2_df = test_df.merge(df2, how = 'inner',
                         left_on = ['Date_1Week_Before', 'Ticker'], right_on = stock_keys)

# 3) price one week after -> nothing clashes any more, so the right-hand columns
#    arrive under their plain names 'Date' and 'Adj_Close'
test3_df = test2_df.merge(df2, how = 'inner',
                          left_on = ['Date_1Week_After', 'Ticker'], right_on = stock_keys)

# The two right-hand date columns only repeat Date_1Week_Before / Date_1Week_After
test4 = test3_df.drop(columns = ['Date_y', 'Date'])

# Give the suffixed columns meaningful names
price_column_names = {
    'Adj_Close_y': 'Price_Before',
    'Adj_Close': 'Price_After',
    'Adj_Close_x': 'Adj_Close',
    'Date_x': 'Date',
}
test5 = test4.rename(columns = price_column_names)

### Clean Transcripts in New Dataset

In [16]:
# Re-clean the transcripts on the merged frame.
# NOTE(review): df1 already carried a 'clean_transcript' column into the merges, so this
# appears to recompute the same values — confirm whether the cell is still needed.
test5['clean_transcript'] = test5['Transcript'].apply(clean)

In [17]:
# List of stopwords
stop = list(STOPWORDS)

# Functions
# removes stop words from a clean transcript
def remove_stop(string):
    """Return `string` with every whitespace-separated token found in `stop` removed.

    Tokens are compared exactly (case-sensitive); surviving tokens are re-joined with
    single spaces. An empty or all-stopword input yields an empty string.
    """
    # Set gives O(1) membership per token instead of scanning the list for every word.
    # Built per call so that later changes to the module-level `stop` list are honoured.
    stop_set = set(stop)
    return ' '.join(token for token in string.split() if token not in stop_set)

# Stems word
def stem(string):
    """Apply NLTK's Porter stemmer to each whitespace-separated token of `string`.

    Returns the stemmed tokens joined by single spaces.
    """
    stemmer = nltk.stem.PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in string.split())

In [18]:
# Strip stopwords from the cleaned transcripts; remove_stop is passed directly
test5['clean_transcript2'] = test5['clean_transcript'].apply(remove_stop)

In [19]:
# The stopword-free version replaces the intermediate cleaned text.
# Re-running this cell on its own raises KeyError, because the column is already gone.
test5 = test5.drop('clean_transcript', axis = 1)

### Reorder Columns

In [31]:
# Map each column position to its column name
colnames = list(test5.columns)
col_dict = dict(enumerate(colnames))
col_dict

{0: 'href',
 1: 'Transcript',
 2: 'Ticker',
 3: 'Year',
 4: 'Month',
 5: 'Day',
 6: 'Date',
 7: 'Quarter',
 8: 'Company Name',
 9: 'Date_1Week_Before',
 10: 'Date_1Week_After',
 11: 'Adj_Close',
 12: 'Price_Before',
 13: 'Price_After',
 14: 'clean_transcript2'}

In [32]:
# Reorder columns by NAME rather than by position: a positional list silently selects the
# wrong columns if an upstream cell adds, removes or reorders a column, whereas a missing
# name here raises KeyError immediately.
cols = [
    'Company Name', 'Ticker', 'Date', 'Adj_Close',
    'Year', 'Quarter', 'Month', 'Day',
    'Date_1Week_Before', 'Price_Before',
    'Date_1Week_After', 'Price_After',
    'clean_transcript2', 'Transcript', 'href',
]
# Positional equivalent of `cols`, kept so any code that still reads `myorder` keeps working
myorder = [test5.columns.get_loc(name) for name in cols]
test6 = test5[cols]
test6.head()

Unnamed: 0,Company Name,Ticker,Date,Adj_Close,Year,Quarter,Month,Day,Date_1Week_Before,Price_Before,Date_1Week_After,Price_After,clean_transcript2,Transcript,href
0,apple inc,aapl,2020-04-30,293.006836,2020,2,4,30,2020-04-23,274.287506,2020-05-07,302.919983,operatorgood day everyone welcome apple inc se...,"[""OperatorGood day, everyone. Welcome to the A...",/earnings/call-transcripts/2020/04/30/apple-in...
1,apple inc,aapl,2020-01-28,316.082184,2020,1,1,28,2020-01-21,314.967865,2020-02-04,317.236298,contents prepared remarks questions answers ca...,"['Contents:', '', 'Prepared Remarks', 'Questio...",/earnings/call-transcripts/2020/01/28/apple-in...
2,apple inc,aapl,2019-10-30,241.304398,2019,4,10,30,2019-10-23,241.225052,2019-11-06,255.172012,operatorgood day everyone welcome apple incorp...,"[""OperatorGood day, everyone. Welcome to the A...",/earnings/call-transcripts/2019/10/30/apple-in...
3,apple inc,aapl,2019-07-30,206.317688,2019,3,7,30,2019-07-23,206.376984,2019-08-06,194.676621,contents prepared remarks questions answers ca...,"['Contents:', '', 'Prepared Remarks', 'Questio...",/earnings/call-transcripts/2019/07/30/apple-in...
4,apple inc,aapl,2019-04-30,197.542618,2019,2,4,30,2019-04-23,204.246475,2019-05-07,199.698502,prepared remarks operator good day welcome app...,"['Prepared Remarks:', 'Operator', ""Good day, a...",/earnings/call-transcripts/2019/04/30/apple-in...


### Export Clean Dataset

In [33]:
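# Export is disabled; uncomment to write the dataset to disk.
# Note: this writes test5 (original column order), not the reordered test6 shown above.
# test5.to_csv('Transcripts_Clean.csv')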