In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk.classify
from nltk import NaiveBayesClassifier
analyzer = SentimentIntensityAnalyzer()

In [3]:
# 10 initializes the dataframe "df" and imports the csv into df; 
# 20 calls getdata to import the csv into the dataframe, 'dfAPI'
# 30 removes any duplicate records; duplicate records imply bot records
# 40 finds certain words in the strings ('body') and deletes the entire record.  
# 50 Vader sentiment analyzer
# 60 creates a new column called 'compound_bin' from the raw_compound scores
# 70 converts the 'raw_compound' data to either a 1, 0 or -1. 1 if nltk sentiment number are >= .1; 0 if -.1 < x < .1 
# 80 Converts sentiment ratings into numerical values and put the value into 'sentiment_number'.
# 90 Determines the percent correct and incorrect for the Vader sentiment values vs the stocktwits sentiment values
# 100 counts how many "None" sentiment values are there for the stocktwits sentiment value
# 110 This removes every other "None" record to reduce the total number of "None" ratings. This is to make the 'None' proportions more equal.
# 115 Provides statistics on sentiments; bullish, none or bearish.
# 120 Allows user to manually input value when stocktwits sentiment value is "None"
# 130 Loads a csv file into the df dfAPI and print out the first 21 records
# 140 This will change the modified rating to the nltk rating only when they are opposite to see if it improves the accuracy
#number 
# 150 imports the csv into the dataframe, 'dfAPI'
# 160 converts the df columns of body and the label (compound or sentiment_number) into one list for each column
#this is needed to be able to create the 
# 170 divides the whole data set into a 80/20 split into a training set and a test set




# 210 takes the test_data list, the test_labels list and the predictions list and puts them into a df


# 650 Loads and combines two different dataframes into dfAPI


METHODS

In [7]:
# methods

# 10 initializes the dataframe "df" and imports the csv into df; 
# the argument is the name/address of the file.
# https://stackoverflow.com/questions/33440805/pandas-dataframe-read-csv-on-bad-data
def getData(name):
    """Read a CSV file (first row is the header) into a new DataFrame.

    The rest of this notebook compares the stocktwits sentiment column with
    the literal string 'None'. Newer pandas releases treat the text "None" as
    a missing value by default, which would silently turn those cells into
    NaN. The NA markers are therefore pinned explicitly so that "None" is
    always kept as text, regardless of the installed pandas version.

    Parameters
    ----------
    name : str or file-like
        Path (or open buffer) of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed file.
    """
    # Default NA markers of pandas, minus the text "None".
    na_markers = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN',
                  '-NaN', '-nan', '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA',
                  'NULL', 'NaN', 'n/a', 'nan', 'null']
    return pd.read_csv(name, header = 0, keep_default_na = False, na_values = na_markers)

    #df1 = pd.read_csv(name, warn_bad_lines=True, error_bad_lines=False)
    #df1 = pd.read_csv(name, nrows = 150, warn_bad_lines=True, error_bad_lines=False)

# 30 removes any duplicate records; duplicate records imply bot records
def remove_duplicates(df):
    """Return a copy of ``df`` without fully duplicated rows.

    The first occurrence of each duplicated row is kept and the original
    index labels of the surviving rows are preserved (no re-indexing).
    The input frame itself is not modified.
    """
    deduplicated = df.drop_duplicates()
    return deduplicated

# 40 finds certain words in the strings ('body') and deletes the entire record.
#Note: When the record is deleted the df is re-indexed. The index for the while statement is not so the result is
#that the record right after the deleted record is skipped. To remedy the problem the index (i) for the while statement 
#is decremented by one.
#Also, the filtering terms are not case sensitive.
def filter_records(df):
    """Drop every record whose body text matches one of the advert patterns.

    The text is read from the third column (position 2, the 'body' column).
    Patterns use shell-style wildcards (``*``) as understood by ``fnmatch``.
    Matching is case-insensitive on every platform; the previous
    ``fnmatch.filter`` call only ignored case on Windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the tweet text in column position 2.

    Returns
    -------
    pandas.DataFrame
        A new frame without the matching records and with a fresh
        0..n-1 index. The input frame is not modified.
    """
    import fnmatch
    import re

    # Words or phrases whose records are to be removed.
    # NOTE(review): entries without '*' (e.g. 'www.', 'Huge Trade') only match
    # when the WHOLE body equals the phrase - confirm whether a substring
    # match ('*www.*') was intended.
    advert = ['* sec *', '* daily News *', '*Huge Print*', '* Form *', '*SweepCast*', '*Large Print*',
          '*Huge Print*', '*8-K*', '*SmartOptions*', '*Big Trade*', '*SEC Form*', '*Notice of Exempt*',
          '*created_at*', '*stock news*', '*Trading Zones*', '*Entry:*', '*New Article*', '*ooc.bz*',
          '*http*', 'Huge Trade', 'Trading is easy', 'www.', '#wallstreetbets', 'wallstreetbets',
          'Huge Trade', '#unitedtraders', 'stockbeep.com']

    if len(df) == 0:
        return df.reset_index(drop = True)

    # Compile each distinct pattern once; dict.fromkeys removes the repeated entries.
    matchers = [re.compile(fnmatch.translate(pattern), re.IGNORECASE).match
                for pattern in dict.fromkeys(advert)]

    def is_advert(text):
        # Non-string bodies (e.g. NaN for a missing value) can never match a pattern.
        return isinstance(text, str) and any(match(text) for match in matchers)

    # Decide for every row up front instead of dropping rows while looping,
    # which needed index bookkeeping and re-copied the frame on every drop.
    keep_positions = [position for position, text in enumerate(df.iloc[:, 2])
                      if not is_advert(text)]

    return df.iloc[keep_positions].reset_index(drop = True)

#50 Vader sentiment analyzer
def vader_sentiment(df):
    """Add a 'raw_compound' column holding the VADER compound score of each 'body'.

    The frame is modified in place and also returned. The number of records
    and the first rows are printed for inspection.
    """
    scorer = SentimentIntensityAnalyzer()

    def compound_score(tweet):
        # polarity_scores returns a dict; only its 'compound' entry is kept
        return scorer.polarity_scores(tweet)['compound']

    df['raw_compound'] = df['body'].apply(compound_score)

    print('The number of clean records in the df are: ', len(df) , '\n')
    print(df.head())

    return df

# 60 creates a new column called 'compound_bin' from the raw_compound scores. This creates a column that the raw 
#where the translated raw compound scores will be placed (either a -1, 0, 1.)
def compound_binning(df):
    """Add a 'compound_bin' column that bins 'raw_compound' into 1, 0 or -1.

    Binning rule (# 70):
        raw_compound >= 0.1         ->  1
        -0.1 < raw_compound < 0.1   ->  0
        raw_compound <= -0.1        -> -1
    Missing scores stay missing (NaN).

    The columns are addressed by name rather than by position, and the
    binning no longer relies on ``np.int``, which was removed from NumPy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric 'raw_compound' column.

    Returns
    -------
    pandas.DataFrame
        The same frame (modified in place) with the 'compound_bin' column
        stored as floats (1.0, 0.0, -1.0).
    """
    raw = df['raw_compound'].astype(float)

    # Conditions are tested in order; the last one catches every remaining
    # non-missing score, i.e. the open interval (-0.1, 0.1).
    df['compound_bin'] = np.select(
        [raw >= 0.1, raw <= -0.1, raw.notna()],
        [1.0, -1.0, 0.0],
        default = np.nan,
    )

    print(df)

    return df

# 80 Converts sentiment ratings into numerical values and put the value into 'sentiment_number'.
#Stocktwits sentiment rating (bullish or Bearish) is used as the standard;
#Stocktwits sentiment rating of 'None' is not used as a standard because people could have simply elected to not enter it.
#https://www.dataquest.io/blog/tutorial-add-column-pandas-dataframe-based-on-if-else-condition/
def convert_sentiment_to_numerical(df):
    """Translate the 'sentiment' labels into numbers and add bookkeeping columns.

    Adds three columns to ``df`` (in place) and returns it:
      * 'sentiment_number': 1.0 for 'Bullish', 0.0 for 'None', -1.0 for
        'Bearish'; any other value falls back to 0 (the np.select default).
      * 'modified_rating': initialised to 0.
      * 'modified?': initialised to 'No'.
    The resulting frame is printed.
    """
    score_by_label = {'Bullish': 1.0, 'None': 0.0, 'Bearish': -1.0}

    # one boolean mask per label, in the same order as the scores
    label_masks = [df['sentiment'] == label for label in score_by_label]
    df['sentiment_number'] = np.select(label_masks, list(score_by_label.values()))

    df['modified_rating'] = 0  # filled in later with the manually entered rating
    df['modified?'] = 'No'     # marks whether a record has been edited by hand

    print(df)

    return df

# 90 Determines the percent correct and incorrect for the Vader sentiment values vs the stocktwits sentiment values
def vader_correct(df):
    """Print how often the binned Vader score agrees with the stocktwits rating.

    Compares column 6 ('compound_bin') with column 7 ('sentiment_number')
    row by row and prints the truncated percentage of matching and of
    non-matching records. Nothing is returned.
    """
    total = len(df)

    # rows where both ratings are equal; every other row counts as incorrect
    correct = sum(1 for row in range(total) if df.iloc[row, 6] == df.iloc[row, 7])
    incorrect = total - correct

    print('The Vader percent correct to stocktwits raw data is:', int(100 * correct/total), '%')
    print('The Vader percent incorrect to stocktwits raw data is:', int(100 * incorrect/total), '%')

    #return df

# 100 counts how many "None" sentiment values are there for the stocktwits sentiment value
def none_count_raw(df):
    """Print the number and percentage of records whose sentiment is 'None'.

    The sentiment is read from column position 4. The percentage is
    truncated to one decimal place. Nothing is returned.
    """
    none_total = sum(1 for row in range(len(df)) if df.iloc[row, 4] == 'None')

    print('The number of "None" stocktwits sentiment values is:', none_total)
    print('The percentage of "None" values is:', (int(none_total/len(df) * 1000)/10), '%')
        
# 110 This removes every other "None" record to reduce the total number of "None" rating. This is to make
#the 'None' proportions more equal. It also prints the ratios of each sentiment response to the total number
#of responses.
def remove_every_other(df):
    """Drop the 'None' records that sit at even row positions, then print statistics.

    The sentiment is read from column position 4. Which rows to drop is
    decided against the ORIGINAL row positions. The previous implementation
    dropped rows while walking the frame by position, so every drop shifted
    the later rows and the wrong records were removed or skipped.

    After dropping, the index is reset and the count and percentage of
    'None', 'Bullish' and 'Bearish' records are printed (percentages are
    truncated to one decimal place; an empty frame reports 0.0).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the stocktwits sentiment in column position 4.

    Returns
    -------
    pandas.DataFrame
        A new, re-indexed frame; the input frame is not modified.
    """
    # keep everything except 'None' rows at an even original position
    keep_positions = [row for row in range(len(df))
                      if not (row % 2 == 0 and df.iloc[row, 4] == 'None')]
    df = df.iloc[keep_positions].reset_index(drop = True) # index is continuous again

    total = len(df)

    report_lines = (
        ('None', 'The number of "None" stocktwits sentiment values is:',
         'The percentage of "None" values is:', '%'),
        ('Bullish', 'The number of "Bullish" stocktwits sentiment values is:',
         'The percentage of "Bullish" values is:', '%'),
        ('Bearish', 'The number of "Bearish" stocktwits sentiment values is:',
         'The percentage of "Bearish" values is:', '% \n'),
    )

    print('\nThe total number of records is: ', total)
    for label, count_text, percent_text, unit in report_lines:
        matches = sum(1 for row in range(total) if df.iloc[row, 4] == label)
        # guard against dividing by zero when no record is left
        percent = int(matches/total * 1000)/10 if total else 0.0
        print(count_text, matches)
        print(percent_text, percent, unit)

    return df

# 115 Provides statistics on sentiments; bullish, none or bearish.
def stats(df):
    """Print how the records are split between 'None', 'Bullish' and 'Bearish'.

    The sentiment is read from column position 4. For each label the number
    of records and its share of the whole frame (truncated to one decimal
    place) are printed. Nothing is returned.
    """
    report_lines = (
        ('None', 'The number of "None" stocktwits sentiment values is:',
         'The percentage of "None" values is:'),
        ('Bullish', 'The number of "Bullish" stocktwits sentiment values is:',
         'The percentage of "Bullish" values is:'),
        ('Bearish', 'The number of "Bearish" stocktwits sentiment values is:',
         'The percentage of "Bearish" values is:'),
    )

    for position, (label, count_text, percent_text) in enumerate(report_lines):
        matches = sum(1 for row in range(len(df)) if df.iloc[row, 4] == label)

        if position == 0:
            # the overall total is reported once, right after the first count
            print('The total number of records is: ', len(df))

        print(count_text, matches)
        print(percent_text, (int(matches/len(df) * 1000)/10), '%')
            
# 120 Allows user to manually input value when stocktwits sentiment value is "None"
# It counts every 20 edits and gives the user the option to quit. If the user chooses to quit
# it breaks from the while look and writes the df to a csv file so all work is saved up to that point.
# upon start up it ask if thie is the first time processing the raw data. If no it loads the csv file into
# the dataframe and starts where the previous session left off. If "modified?" is "Yes and "sentiment" is "None"
# it skips the record. Therefore it will re-start at the first "modified?" is "No" and "sentiment" is "None"
def edit(df):
    """Interactively rate the records whose stocktwits sentiment is 'None'.

    Workflow:
      1. Confirm (or replace) the name of the csv file the result is written to.
      2. Ask whether this is the first pass over the raw data. If the answer
         is n/N/no/No the previously saved csv is loaded with ``getData`` and
         editing resumes from it.
      3. Walk through the records. Column positions used:
         2 = body, 4 = sentiment, 7 = sentiment_number, 8 = modified_rating,
         9 = modified?.
           * sentiment 'None' and modified? 'No': show the body, ask for a
             rating and mark the record as modified ('Yes').
           * sentiment 'Bearish' or 'Bullish': copy sentiment_number into
             modified_rating.
      4. After every 20 manual edits the user may stop early.
      5. The frame is always written to the csv file before returning.

    A rating that is not one of 1, 0 or -1 is asked for again. Previously a
    typo raised ValueError before the file was written, losing the session's
    edits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame produced by the earlier pipeline steps (at least 10 columns).

    Returns
    -------
    pandas.DataFrame
        The edited frame (the loaded frame when resuming).
    """
    filename = "tech stockTwit 03112021 adjusted Rev1.csv"

    print('The name of the csv file that will be written to is: ', filename)

    correct_name = input('Is this the correct filename? (enter "N" or "n" for no)')
    if correct_name == 'N' or correct_name == 'n':
        filename = input('What is the correct name?')

    load = input('Is this the first time processing the raw stocktwits data (enter "n"/"N" or "y"/"Y")? ')
    if load in ('n', 'N', 'no', 'No'):
        df = getData(filename)
        print('Loaded filename:', filename)
    else:
        print('ok')

    edits_since_prompt = 0  # manual edits made since the user was last offered to stop

    for i in range(len(df)):
        sentiment = df.iloc[i, 4]

        if sentiment == 'None' and df.iloc[i, 9] == 'No':
            print('\nindex number:', i, '\n', df.iloc[i, 2])

            # keep asking until a valid rating is entered
            while True:
                answer = input('Enter your rating (1, 0 or -1.):')
                try:
                    rating = int(answer)
                except ValueError:
                    rating = None
                if rating in (1, 0, -1):
                    break
                print('Invalid rating. Please enter 1, 0 or -1.')

            df.iloc[i, 8] = rating # writes the entered number to 'modified_rating'
            df.iloc[i, 9] = 'Yes'  # marks the record so a later session skips it
            edits_since_prompt += 1

        elif sentiment == 'Bearish' or sentiment == 'Bullish':
            df.iloc[i, 8] = df.iloc[i, 7] # copies the stocktwits 'sentiment_number' to the 'modified_rating'

        if edits_since_prompt == 20: # represents 20 edits
            stop = input('Do you want to quit? (Enter either a "y" or "Y") ')
            if stop == 'y' or stop == 'Y':
                print('You are exiting.')
                break
            edits_since_prompt = 0 # another 20 records must be edited before asking again

    df.to_csv(filename, index = False)
    print('The csv file was written. File name: ', filename)

    return df

# 140 This will change the modified rating to the nltk rating only when they are opposite to see if it improves 
#the accuracy number 
def change_opp_nltk(df):
    """Overwrite 'modified_rating' with 'compound_bin' where the two are opposite.

    A record is changed only when column 6 ('compound_bin') is -1 while
    column 8 ('modified_rating') is 1, or the other way round. The user is
    first asked to confirm (or replace) the output file name; the resulting
    frame is written to that csv file and returned.
    """
    filename = 'tech stockTwit 02232021 opposite compound_bin vs modified_rating.csv'

    print('The name of the csv file that will be written to is: ', filename)

    correct_name = input('Is this the correct filename? (enter "N" or "n" for no)')
    if correct_name in ('N', 'n'):
        filename = input('What is the correct name?')

    for row in range(len(df)):
        nltk_bin = df.iloc[row, 6]
        rating = df.iloc[row, 8]

        opposite = (nltk_bin == -1 and rating == 1) or (nltk_bin == 1 and rating == -1)
        if opposite:
            df.iloc[row, 8] = nltk_bin # change "modified rating" to "compound_bin"

    df.to_csv(filename, index = False)
    print('The csv file was written. File name: ', filename)

    return df

# 180 counts how many records have a value of 0.0 in column 8 ('modified_rating'), i.e. a neutral ("None") rating
def none_count(df):
    """Print the number and percentage of records whose column 8 equals 0.0.

    Column position 8 holds the 'modified_rating'; a value of 0.0 is
    reported as a "None" sentiment. The percentage is truncated to one
    decimal place. Nothing is returned.
    """
    neutral_total = sum(1 for row in range(len(df)) if df.iloc[row, 8] == 0.0)

    print('The number of "None" stocktwits sentiment values is:', neutral_total)
    print('The percentage of "None" values is:', (int(neutral_total/len(df) * 1000)/10), '%')
    
#440 sets up stopword removal; returns stopWords
def set_up_nltk_stopword_removal():
    """Download the nltk 'stopwords' corpus and return the English stopwords as a set.

    The size of the set is printed before it is returned.
    """
    import nltk
    from nltk.corpus import stopwords

    nltk.download('stopwords')
    english_stopwords = set(stopwords.words('english'))

    print(len(english_stopwords))
    return english_stopwords

#470 creates a list of new stopwords and then adds them to the set provided by nltk
#Note: it is case sensitive
#Input is the nltk stopword list ("stopWords")
def add_new_stopwords(sw):
    """Add the project-specific stopwords to ``sw`` and return them as a sorted list.

    Stopword matching is case sensitive, so several entries appear in more
    than one spelling ('The'/'the', 'It'/'it', ...).

    The returned list is sorted case-insensitively, with the exact spelling
    as a tie-breaker. Without the tie-breaker, entries that differ only in
    case kept the arbitrary iteration order of the set, so the list could
    differ from run to run.

    Parameters
    ----------
    sw : set of str
        Stopword set (e.g. from nltk). It is extended in place.

    Returns
    -------
    list of str
        All stopwords of the extended set, sorted.
    """
    newStopWords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
    newStopWords += ['again', 'against', 'all', 'almost', 'alone', 'along']
    newStopWords += ['already', 'also', 'although', 'always', 'am', 'among']
    newStopWords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
    newStopWords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
    newStopWords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
    newStopWords += ['because', 'become', 'becomes', 'becoming', 'been']
    newStopWords += ['before', 'beforehand', 'behind', 'being', 'below']
    newStopWords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
    newStopWords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
    newStopWords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
    newStopWords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
    newStopWords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
    newStopWords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
    newStopWords += ['every', 'everyone', 'everything', 'everywhere', 'except']
    newStopWords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
    newStopWords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
    newStopWords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
    newStopWords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
    newStopWords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
    newStopWords += ['herself', 'him', 'himself', 'his', 'how', 'however']
    newStopWords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
    newStopWords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
    newStopWords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
    newStopWords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
    newStopWords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
    newStopWords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
    newStopWords += ['nevertheless', 'next', 'nine', 'nobody', 'none'] #removed 'no'
    newStopWords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
    newStopWords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
    newStopWords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
    newStopWords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
    newStopWords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
    newStopWords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
    newStopWords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
    newStopWords += ['some', 'somehow', 'someone', 'something', 'sometime']
    newStopWords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
    newStopWords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
    newStopWords += ['then', 'thence', 'there', 'thereafter', 'thereby']
    newStopWords += ['therefore', 'therein', 'thereupon', 'these', 'they']
    newStopWords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
    newStopWords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
    newStopWords += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
    newStopWords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
    newStopWords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what']
    newStopWords += ['whatever', 'when', 'whence', 'whenever', 'where']
    newStopWords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
    newStopWords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
    newStopWords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
    newStopWords += ['within', 'without', 'would', 'yet', 'you', 'your']
    newStopWords += ['yours', 'yourself', 'yourselves'] #provided by Codecademy??

    # additional stopwords:
    newStopWords += ['[Screenshot]', '[screenshot]', 'Screenshot', '[Screenshot]Great', '[SCREENSHOT]', 'screenshot', 
                 'The', 'the', 'SMART', 'yah', 'got', 'nutty', 'moving', 'weeks', 'Got', 'So', 'today', 'Been', 'or']

    newStopWords += ['I', 'it', 'It'] # pronouns

    newStopWords += ['AMD', 'NVDA','NVDA', 'TSLA', 'GOOG', 'BA', 'FB', 'GOOGL', 'INTC', 'intel', 'Intel', 'CSCO', 'MU', 
                 'SMH', 'TSM','AAPL', 'TSLA', 'CSCO', 'POETF', 'PHOTONICS', 'DD', 'ARWR', 'T', 'INFI', 'AMC', 'ARK',
                'GME', 'NIO', 'QS'] # Stock symbols or names

    newStopWords += ['Readytogo123', 'Maddog68','Stocktwits'] # nouns

    newStopWords += ['.', '?', '!', ';', ',', "'"] # punctuation

    newStopWords += ['&', '#', '%', '$', '@'] # symbols

    newStopWords += ['41.75', '530.05', '39', 'Two', 'two',] # numbers

    # Adds them to the stopword set; a set holds no duplicates, so words that
    # are listed twice above (or are already present) are stored only once.
    sw.update(newStopWords)

    print(len(sw))

    # A set has no order, so it is converted to a list to be alphabetized.
    # The second key component makes the order of case variants deterministic.
    stopWords_list = sorted(sw, key = lambda k : (k.lower(), k))
    print(stopWords_list)

    return stopWords_list

#480 This removes words from the list of stopwords and writes list to csv file
# https://stackoverflow.com/questions/29771168/how-to-remove-words-from-a-list-in-python#:~:text=one%20more%20easy%20way%20to%20remove%20words%20from,%3D%20words%20-%20stopwords%20final_list%20%3D%20list%20%28final_list%29
#new_words = list(filter(lambda w: w not in stop_words, initial_words))
def remove_from_stopwords(sw):
    """Return the stopwords without the word 'no' and save them to 'stopwords.csv'.

    The order of the remaining words is unchanged. The list is written to
    'stopwords.csv' (single column 'stopwords', no index), printed, and
    returned.
    """
    words_to_remove = ['no']
    remaining = [word for word in sw if word not in words_to_remove]

    # one-column frame, used only to write the list to disk
    pd.DataFrame(remaining, columns = ['stopwords']).to_csv('stopwords.csv', index = False)

    print(remaining)

    return remaining

#490 Checks to see of the words were removed from the stopWords list.
#inputs: stopword list: output from def remove_from_stopwords(sw); the word to be removed
def check_stopwords(sw, WordToBeRem):
    """Report whether the given word(s) are still present in the stopword list.

    Words are compared as whole words. The previous ``w in WordToBeRem``
    test was a substring test when a single string was passed, so checking
    for 'no' wrongly reported stopwords such as 'o' or 'n'.

    Parameters
    ----------
    sw : iterable of str
        The stopword list to inspect.
    WordToBeRem : str or iterable of str
        The word, or a collection of words, that should have been removed.

    Returns
    -------
    None
        One line is printed per offending word, or a confirmation line when
        none of the words is found.
    """
    if isinstance(WordToBeRem, str):
        removed_words = {WordToBeRem}
    else:
        removed_words = set(WordToBeRem)

    still_present = [w for w in sw if w in removed_words]

    for w in still_present:
        print('The word ', w , ' is still in the stopWords list!')

    if not still_present:
        print('It did remove the words from the stopWords list!')
    
    #print(len(stopWords))

#510 Removes stopwords from all the "body" text (tweets); to do this it must tokenize the string which means it must parse 
# the string into individual words. It then compares the words with the words in the stopwords list and if there is not 
# match it puts the word into the "wordsFiltered" list. It keeps appending to the list until all of the words are checked.
# It then joins the individual words back into a string.

#There is a difference between "deep" copy and "shallow" copy. "Deep" copy make a copy where the index and data are
# separate from the original. "Shallow" copy is like a pointer where the two df share a common index and data
#dfScrubbed = df #This is a shallow copy
def rem_stopwords(df, stopWords):
    """Return a copy of ``df`` whose body text has the stopwords removed.

    Each body (column position 2) is split into tokens with nltk's
    ``word_tokenize``; tokens found in ``stopWords`` are discarded (the
    comparison is case sensitive) and the remaining tokens are joined back
    together with single spaces.

    The stopwords are converted to a set once, so each token lookup no
    longer scans the whole stopword list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the tweet text in column position 2. It is not modified.
    stopWords : iterable of str
        Words to remove.

    Returns
    -------
    pandas.DataFrame
        A deep copy of ``df`` with the scrubbed text in column position 2.
        Its first rows are printed.
    """
    from nltk.tokenize import word_tokenize

    stop_set = frozenset(stopWords)

    def scrub(text):
        return ' '.join(w for w in word_tokenize(text) if w not in stop_set)

    dfScrubbed = df.copy() # deep copy (deep = True is the default); df keeps the raw text

    body_column = df.columns[2]
    dfScrubbed[body_column] = df.iloc[:, 2].map(scrub)

    print(dfScrubbed.head())

    return dfScrubbed

#550 converts the scrubbed_compound scores into a 1 significant figure integer from a float number; rounding up
# this is only needed if you are going to uses the 'scrubbed_compound' value as the label.
def int_conversion(dfs):
    """Replace 'scrubbed_compound' by an integer score, in place.

    Each score is shifted by 0.05, multiplied by 10 and converted to a
    64-bit integer (the fractional part is discarded). Nothing is returned;
    the column of ``dfs`` is overwritten.
    """
    scaled = (dfs['scrubbed_compound'] + .05) * 10
    dfs['scrubbed_compound'] = np.int64(scaled)

# 550 converts the 'scrubbed_compound' (column 10) data to either a 1, 0 or -1.  
# if nltk sentiment number are >= .1; 0 if -.1 < x < .1 and -1 if <= -.1 and over-rights the value in compound_bin
# creates a new column called 'compound_bin' from the raw_compound scores
def bin_sentiment(dfs):
    """Add a 'scrubbed_compound_bin' column that bins 'scrubbed_compound' into 1, 0 or -1.

    Binning rule:
        scrubbed_compound >= 0.1        ->  1
        -0.1 < scrubbed_compound < 0.1  ->  0
        scrubbed_compound <= -0.1       -> -1
    Missing scores stay missing (NaN).

    The previous implementation looped up to the length of the global
    ``df`` instead of the ``dfs`` argument, used ``np.int`` (removed from
    NumPy) and addressed the columns by hard-coded positions 10 and 11.

    Parameters
    ----------
    dfs : pandas.DataFrame
        Scrubbed frame containing a numeric 'scrubbed_compound' column.
        It is modified in place and then printed.

    Returns
    -------
    None
    """
    scores = dfs['scrubbed_compound'].astype(float)

    # Conditions are tested in order; the last one catches every remaining
    # non-missing score, i.e. the open interval (-0.1, 0.1).
    dfs['scrubbed_compound_bin'] = np.select(
        [scores >= 0.1, scores <= -0.1, scores.notna()],
        [1.0, -1.0, 0.0],
        default = np.nan,
    )

    print(dfs)

# compares the first record (index = 0) raw data ("body" column) with scrubbed (stopwords removed) data
#inputs: df - original df; dfs - scrubbed df (stopwords removed)
def compare_scrubbed(df, dfs):
    """Print the body (column 2) of the first record of ``df`` and then of ``dfs``.

    ``df`` is the original frame and ``dfs`` the scrubbed one, so the two
    printed lines show the text before and after stopword removal.
    """
    for frame in (df, dfs):
        print(frame.iloc[0, 2])

# 650 Loads and combines two different dataframes in df; this is to combine two input datasets where the 'none'
#values have been modified; this is to see if increased records will increase the accuracy of the model.
def combine_dfs(filename1 = "tech stockTwit 03112021 adjusted-Copy1.csv",
                filename2 = "tech stockTwit 02232021 adjusted-Copy1.csv"):
    """Load two csv files and stack their records into one DataFrame.

    The frames are stacked with ``pd.concat`` (``DataFrame.append`` was
    removed in pandas 2.0); as before, each record keeps the index label it
    had in its own file. The combined frame is now returned - previously it
    was built and then discarded.

    Parameters
    ----------
    filename1, filename2 : str, optional
        The csv files to combine. The defaults are the two files that were
        hard-coded before.

    Returns
    -------
    pandas.DataFrame
        The records of file 1 followed by the records of file 2.
    """
    df1 = getData(filename1)
    df2 = getData(filename2)

    df = pd.concat([df1, df2])

    print('The length of file 1 is:', len(df1))
    print('The length of file 2 is:', len(df2))

    print('The length of the combined dataframe is:', len(df))

    return df

# Writes a csv file
#input: df that is to be saved as a csv; output file name (eg 'tech stockTwit 03112021 dup advert stopwords.csv'
def write_csv(df, filename_output):
    """Save ``df`` to the csv file ``filename_output`` (index not written) and confirm it.

    A confirmation line containing the file name is printed. Nothing is
    returned.
    """
    df.to_csv(path_or_buf = filename_output, index = False)
    print('The csv file was written. File name: ', filename_output)


MAIN

In [9]:


name = 'tech stockTwit 03112021.csv'
df = getData(name) #returns df; reads csv file into df

#OPTIONAL:
df = remove_duplicates(df) #return df; removes duplicates

#OPTIONAL:
#df = filter_records(df) #returns df; removes addvertisements

# NOT OPTIONAL
df = vader_sentiment(df) #returns df; adds column with Vader sentiment values ('raw_compound') from the 'body' column.

# NOT OPTIONAL
df = compound_binning(df) #returns df; adds a column where the raw_compound scores are translated into 1, 0 or -1 'compound_bin'

#80 NOT OPTIONAL
df = convert_sentiment_to_numerical(df) #returns df

# 90 OPTIONAL
#vader_correct(df) 

# 100 OPTIONAL: Counts how many "None" sentiment values are there for the stocktwits sentiment value
#none_count_raw(df) 

# 110 OPTIONAL: This removes every other "None" record to reduce the total number of "None" rating. This is to make
#the 'None' proportions more equal. It also prints the ratios of each sentiment response to the total number
#of responses.
#df = remove_every_other(df) #returns df

# 115 OPTIONAL: Provides statistics on sentiments; bullish, none or bearish.
#stats(df) 


# 120 OPTIONAL: Allows user to manually input value when stocktwits sentiment value is "None"
# It counts every 20 edits and gives the user the option to quit. If the user chooses to quit
# it breaks from the while look and writes the df to a csv file so all work is saved up to that point.
# upon start up it ask if thie is the first time processing the raw data. If no it loads the csv file into
# the dataframe and starts where the previous session left off. If "modified?" is "Yes and "sentiment" is "None"
# it skips the record. Therefore it will re-start at the first "modified?" is "No" and "sentiment" is "None"
#df = edit(df) #returns df

# OPTIONAL:
csv_name = 'tech stockTwit 03112021 adjusted Rev1.csv'
df = getData(csv_name)

# 180 OPTIONAL: counts how many "None" sentiment values are there for the stocktwits sentiment value
#none_count(df) 

# 140 OPTIONAL: This will change the modified rating to the nltk rating only when they are opposite to see if it improves 
#the accuracy number 
#df = change_opp_nltk(df) #returns df

# 180 OPTIONAL: counts how many "None" sentiment values are there for the stocktwits sentiment value
#none_count(df) 

#440 imports the nltk stopword list that holds the stopwords that will be removed from the text ('body.') 
sw = set_up_nltk_stopword_removal() 

#470 creates a list of new stopwords and then adds them to the set provided by nltk
#Note  it is case sensitive
#Input is the nltk stopword list ("stopWords")
sw = add_new_stopwords(sw) 

#480 This removes words from the list of stopwords and writes list to csv file
# https //stackoverflow.com/questions/29771168/how-to-remove-words-from-a-list-in-python# ~ text=one%20more%20easy%20way%20to%20remove%20words%20from,%3D%20words%20-%20stopwords%20final_list%20%3D%20list%20%28final_list%29
#new_words = list(filter(lambda w  w not in stop_words, initial_words))
sw = remove_from_stopwords(sw) 
#return stopWords

#490 Checks to see of the words were removed from the stopWords list.
#inputs  stopword list  output from  remove_from_stopwords(sw); the word to be removed
check_stopwords(sw, 'no') 

#510 Removes stopwords from all the "body" text (tweets); to do this it must tokenize the string which means it must parse 
# the string into individual words. It then compares the words with the words in the stopwords list and if there is not 
# match it puts the word into the "wordsFiltered" list. It keeps appending to the list until all of the words are checked.
# It then joins the individual words back into a string.
#There is a difference between "deep" copy and "shallow" copy. "Deep" copy make a copy where the index and data are
# separate from the original. "Shallow" copy is like a pointer where the two df share a common index and data
#dfAPIScrubbed = dfAPI #This is a shallow copy
dfScrubbed = rem_stopwords(df, sw) 
#return dfScrubbed

#550 converts the scrubbed_compound scores into a 1 significant figure integer from a float number; rounding up
# this is only needed if you are going to uses the 'scrubbed_compound' value as the label.
#int_conversion(dfs) #return df

# compares the first record (index = 0) raw data ("body" column) with scrubbed (stopwords removed) data
#inputs  df - original df; dfs - scrubbed df (stopwords removed)
dfs = dfScrubbed
compare_scrubbed(df, dfs) 

# 650 Loads and combines two different dataframes in dfAPI; this is to combine two input datasets where the 'none'
#values have been modified; this is to see if increased records will increase the accuracy of the model.
#combine_dfs() 

# Writes a csv file
#input: df that is to be saved as a csv; output file name (eg 'tech stockTwit 03112021 dup advert stopwords.csv')
#filename_output = 'tech stockTwit 03112021 dup.csv'
filename_output = 'tech stockTwit 03112021 adjusted Rev1.csv'

write_csv(df, filename_output) 




The number of clean records in the df are:  2418 

  symbol            created_at  \
0   INTC  2021-03-05T21:01:03Z   
1   INTC  2021-03-05T21:01:03Z   
2   INTC  2021-03-05T21:00:02Z   
3   INTC  2021-03-05T20:51:14Z   
4   INTC  2021-03-05T20:06:56Z   

                                                body followers sentiment  \
0  $INTC Big Trade - $16 399 800.270 000 shares a...       862      None   
1  Large Print $INTC Size: 270000 Price: 60.74 Ti...      5502      None   
2  Huge Print $INTC Size: 4033477 Price: 60.74 Ti...      5502      None   
3               $AMD common follow ur sibs $INTC $MU        48   Bullish   
4                $ITT $INTC $ADBE $OPTT $GLBS  .  .        575   Bullish   

   raw_compound  
0        0.2960  
1        0.0000  
2        0.3182  
3        0.0000  
4        0.0000  
  symbol            created_at  \
0   INTC  2021-03-05T21:01:03Z   
1   INTC  2021-03-05T21:01:03Z   
2   INTC  2021-03-05T21:00:02Z   
3   INTC  2021-03-05T20:51:14Z   
4   INTC 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pstri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  symbol            created_at  \
0   INTC  2021-03-05T20:51:14Z   
1   INTC  2021-03-05T20:06:56Z   
2   INTC  2021-03-05T19:57:20Z   
3   INTC  2021-03-05T19:52:43Z   
4   INTC  2021-03-05T19:36:13Z   

                                                body followers sentiment  \
0                              common follow ur sibs        48   Bullish   
1                                 ITT ADBE OPTT GLBS       575   Bullish   
2  Should thankful bull market Dow couple reachin...        21   Bullish   
3  ButterFingerDROPs sell October Still probably ...        77      None   
4                       At rate left like flew - : (        11      None   

   raw_compound  compound_bin  sentiment_number  modified_rating modified?  
0        0.0000           0.0               1.0              1.0        No  
1        0.0000           0.0               1.0              1.0        No  
2        0.6996           1.0               1.0              1.0        No  
3        0.0000           0.0 

In [21]:
# Shows the current contents of df; pandas abbreviates long frames with "..." when printing.
#df = df.reset_index(drop = True)
print(df)

     symbol            created_at  \
0      INTC  2021-03-05T20:51:14Z   
1      INTC  2021-03-05T20:06:56Z   
2      INTC  2021-03-05T19:57:20Z   
3      INTC  2021-03-05T19:52:43Z   
4      INTC  2021-03-05T19:36:13Z   
...     ...                   ...   
1286     MU  2021-02-24T13:22:35Z   
1287     MU  2021-02-24T12:48:31Z   
1288     MU  2021-02-24T12:38:21Z   
1289     MU  2021-02-24T12:10:44Z   
1290     MU  2021-02-24T12:10:09Z   

                                                   body followers sentiment  \
0                  $AMD common follow ur sibs $INTC $MU        48   Bullish   
1                   $ITT $INTC $ADBE $OPTT $GLBS  .  .        575   Bullish   
2     $INTC Should be thankful we are in this bull m...        21   Bullish   
3     @ButterFingerDROPs $INTC had its sell off back...        77      None   
4     $AMD At this rate  this will be left behind by...        11      None   
...                                                 ...       ...       ...   
12

In [10]:
# 100 counts how many "None" sentiment values are there for the stocktwits sentiment value
# NOTE(review): no definition of none_count_raw is visible in this part of the notebook (the helper defined
# in the individual-method cells is named none_count) - confirm the name before running on a fresh kernel.
none_count_raw(df) 


The number of "None" stocktwits sentiment values is: 481
The percentage of "None" values is: 37.2 %


In [None]:
MAIN

  symbol            created_at  \
0   INTC  2021-03-05T21:01:03Z   
1   INTC  2021-03-05T21:01:03Z   
2   INTC  2021-03-05T21:00:02Z   
3   INTC  2021-03-05T20:51:14Z   
4   INTC  2021-03-05T20:06:56Z   
5   INTC  2021-03-05T19:57:20Z   
6   INTC  2021-03-05T19:52:43Z   
7   INTC  2021-03-05T19:44:47Z   
8   INTC  2021-03-05T19:36:13Z   
9   INTC  2021-03-05T19:27:49Z   

                                                body followers sentiment  
0  $INTC Big Trade - $16 399 800.270 000 shares a...       862      None  
1  Large Print $INTC Size: 270000 Price: 60.74 Ti...      5502      None  
2  Huge Print $INTC Size: 4033477 Price: 60.74 Ti...      5502      None  
3               $AMD common follow ur sibs $INTC $MU        48   Bullish  
4                $ITT $INTC $ADBE $OPTT $GLBS  .  .        575   Bullish  
5  $INTC Should be thankful we are in this bull m...        21   Bullish  
6  @ButterFingerDROPs $INTC had its sell off back...        77      None  
7  $INTC  Trading is easy

2500

METHODS IN INDIVIDUAL CELLS

In [3]:
# 10 initializes the dataframe "df" and imports the csv into df; 
# the argument is the name/address of the file.
# https://stackoverflow.com/questions/33440805/pandas-dataframe-read-csv-on-bad-data
def getData(name):
    """Read the csv file ``name`` into a new DataFrame and return it.

    name: file name/address handed straight to ``pd.read_csv``; the first row of the
          file is used as the column header (``header=0``).

    NOTE(review): newer pandas releases may parse the literal text 'None' as a missing
    value when reading a csv - confirm that the 'sentiment' column still contains the
    string 'None' after loading, because later steps compare against that string.
    """
    return pd.read_csv(name, header=0)

    #df1 = pd.read_csv(name, warn_bad_lines=True, error_bad_lines=False)
    #df1 = pd.read_csv(name, nrows = 150, warn_bad_lines=True, error_bad_lines=False)


In [2]:
# 30 removes any duplicate records; duplicate records imply bot records
def remove_duplicates(df):
    """Return a copy of ``df`` without fully identical rows.

    The first occurrence of each row is kept and the original index labels are
    preserved (the index is not renumbered here). The input frame is not modified.
    """
    return df.drop_duplicates()

In [3]:
# 40 finds certain words/phrases in the tweet text ('body') and deletes the entire record.
# Matching is case-insensitive on every platform, and a phrase matches anywhere inside the text.
# https://www.geeksforgeeks.org/fnmatch-unix-filename-pattern-matching-python/

def filter_records(df):
    """Drop advert / bot records and return the remaining rows with a fresh 0..n-1 index.

    df: DataFrame holding the tweet text in a column named 'body'; when no such column
        exists the third column (position 2) is used, as the original code did.

    Each entry of ``advert`` is a shell-style pattern (``*`` = any run of characters).
    Entries written without a wildcard are treated as "contains this phrase": taken
    literally they would only match a tweet that consists of nothing but the phrase,
    which let records such as '$INTC  Trading is easy ...' slip through.
    Letter case is ignored explicitly, because fnmatch on its own only ignores case on
    Windows. Non-text bodies (e.g. missing values) never match and are kept.

    The input frame is not modified.
    """
    import fnmatch
    import re

    # words or phrases whose records are to be removed
    advert = ['* sec *', '* daily News *', '*Huge Print*', '* Form *', '*SweepCast*', '*Large Print*',
              '*8-K*', '*SmartOptions*', '*Big Trade*', '*SEC Form*', '*Notice of Exempt*',
              '*created_at*', '*stock news*', '*Trading Zones*', '*Entry:*', '*New Article*', '*ooc.bz*',
              '*http*', 'Huge Trade', 'Trading is easy', 'www.', '#wallstreetbets', 'wallstreetbets',
              '#unitedtraders', 'stockbeep.com']

    matchers = []
    for term in advert:
        # a bare phrase becomes "*phrase*" so that it is searched for inside the text
        glob = term if any(ch in term for ch in '*?[') else '*' + term + '*'
        matchers.append(re.compile(fnmatch.translate(glob), re.IGNORECASE))

    def is_advert(text):
        return isinstance(text, str) and any(m.match(text) for m in matchers)

    bodies = df['body'] if 'body' in df.columns else df.iloc[:, 2]

    # one pass over the rows; positions are collected first so nothing shifts while scanning
    keep_positions = [pos for pos, text in enumerate(bodies) if not is_advert(text)]

    return df.iloc[keep_positions].reset_index(drop = True)


In [4]:
#50 Vader sentiment analyzer

def vader_sentiment(df):
    """Score every tweet with VADER and store the result in a 'raw_compound' column.

    df: DataFrame with the tweet text in its 'body' column. The frame is modified in
        place (the 'raw_compound' column is added or overwritten) and also returned.

    Prints the number of records and the first rows of the frame.
    """
    scorer = SentimentIntensityAnalyzer()

    def compound_score(tweet):
        # polarity_scores returns a dict of scores; only the 'compound' entry is kept
        return scorer.polarity_scores(tweet)['compound']

    df['raw_compound'] = df['body'].apply(compound_score)

    record_total = len(df)
    print('The number of clean records in the df are: ', record_total, '\n')
    print(df.head())

    return df

In [7]:
# 60 creates a new column called 'compound_bin' from the raw_compound scores. This is the column
#where the translated raw compound scores will be placed (either a -1, 0 or 1).

def compound_binning(df, threshold=0.1):
    """Translate the VADER 'raw_compound' score into a -1 / 0 / 1 'compound_bin' column.

    df:        DataFrame with a numeric 'raw_compound' column. It is modified in place
               ('compound_bin' is added or overwritten) and also returned.
    threshold: scores >= threshold become 1, scores <= -threshold become -1 and
               everything in between becomes 0. Defaults to the original 0.1.

    The columns are addressed by name instead of by position, the function's own
    ``df`` argument is used throughout (it used to read the global ``dfAPI``), and the
    removed ``np.int`` alias is no longer needed. Rows whose raw score is missing keep
    a missing bin.
    NOTE(review): the old formula int(score + .9) would have produced 2 for scores of
    1.1 or more; this assumes scores stay within [-1, 1] - confirm for other scorers.
    """
    raw = df['raw_compound']

    bins = np.select([raw >= threshold, raw <= -threshold], [1.0, -1.0], default=0.0)

    # a missing score is not "neutral": leave its bin missing as well
    df['compound_bin'] = np.where(raw.isna(), np.nan, bins)

    print(df)

    return df

In [6]:
# 80 Converts sentiment ratings into numerical values and put the value into 'sentiment_number'.
#Stocktwits sentiment rating (bullish or Bearish) is used as the standard;
#Stocktwits sentiment rating of 'None' is not used as a standard because people could have simply elected to not enter it.
#https://www.dataquest.io/blog/tutorial-add-column-pandas-dataframe-based-on-if-else-condition/

def convert_sentiment_to_numerical(df):
    """Add the numeric stocktwits rating plus the two bookkeeping columns to ``df``.

    df: DataFrame with a 'sentiment' column. It is modified in place and returned.

    Columns written:
      'sentiment_number' - 1.0 for 'Bullish', 0.0 for 'None', -1.0 for 'Bearish';
                           any other value also ends up as 0 (np.select's default)
      'modified_rating'  - initialised to 0
      'modified?'        - initialised to 'No'

    The whole frame is printed before it is returned.
    """
    import numpy as np

    # (label in the 'sentiment' column, numeric value), in the order they are tested
    label_values = (('Bullish', 1.0), ('None', 0.0), ('Bearish', -1.0))

    sentiment = df['sentiment']
    matches = [sentiment == label for label, _ in label_values]
    numbers = [number for _, number in label_values]

    df['sentiment_number'] = np.select(matches, numbers)
    df['modified_rating'] = 0
    df['modified?'] = 'No'

    print(df)

    return df

In [33]:
# 90 Determines the percent correct and incorrect for the Vader sentiment values vs the stocktwits sentiment values

def vader_correct(df):
    """Print how often the VADER bin agrees with the stocktwits rating.

    df: DataFrame whose column at position 6 is 'compound_bin' and whose column at
        position 7 is 'sentiment_number' (the layout produced by the earlier steps).
        It is returned unchanged.

    Percentages are truncated to whole numbers, so the two figures may add up to 99.
    An empty frame reports 0 % for both instead of raising ZeroDivisionError.
    """
    total = len(df)

    vader_bins = df.iloc[:, 6].to_numpy()
    stocktwits = df.iloc[:, 7].to_numpy()

    # element-wise comparison of the two columns; missing values never count as equal
    correct = int((vader_bins == stocktwits).sum())
    incorrect = total - correct

    def percent(count):
        return int(100 * count / total) if total else 0

    print('The Vader percent correct to stocktwits raw data is:', percent(correct), '%')
    print('The Vader percent incorrect to stocktwits raw data is:', percent(incorrect), '%')

    return df
        

The Vader percent correct to stocktwits raw data is: 40 %
The Vader percent incorrect to stocktwits raw data is: 59 %


"\ncorrect = 0\nincorrect = 0\ntotal = len(dfAPI)\ni = 0\nwhile i < len(dfAPI):\n    if dfAPI.iloc[i, 7] == dfAPI.iloc[i, 9]:\n        correct += 1\n    else:\n        incorrect += 1 \n        \n    i += 1\n\nprint('The Vader percent correct compared to stocktwit enhanced is:', int(100 * correct/total), '%')\nprint('The Vader percent incorrect compared to stocktwits enhanced is:', int(100 * incorrect/total), '%')\n\n"

In [34]:
# 100 counts how many "None" sentiment values are there for the stocktwits sentiment value

def none_count(df):
    """Print how many records carry the stocktwits sentiment 'None', and their share.

    df: DataFrame whose column at position 4 holds the stocktwits sentiment text.

    The frame passed in is the one that is counted (the loop bound and the percentage
    used to come from the global ``dfAPI``). An empty frame reports 0.0 % instead of
    dividing by zero. Nothing is returned.

    NOTE: a later cell of this notebook defines another function that is also called
    none_count (it counts zeros in column 8); whichever cell ran last is the one in effect.
    """
    total = len(df)
    none_total = int((df.iloc[:, 4] == 'None').sum())

    # truncated to one decimal place, e.g. 56.3
    percent = int(none_total / total * 1000) / 10 if total else 0.0

    print('The number of "None" stocktwits sentiment values is:', none_total)
    print('The percentage of "None" values is:', percent, '%')
        

The number of "None" stocktwits sentiment values is: 1408
The percentage of "None" values is: 56.3 %


In [8]:
# 110 This removes every other "None" record to reduce the total number of "None" rating. This is to make
#the 'None' proportions more equal. It also prints the ratios of each sentiment response to the total number
#of responses.
def remove_every_other(df):
    """Drop every other record whose stocktwits sentiment is 'None' and print the new mix.

    df: DataFrame whose column at position 4 holds the stocktwits sentiment text.
        The input frame is not modified.

    Returns a new frame, re-indexed 0..n-1, in which the 1st, 3rd, 5th, ... of the
    'None' records have been removed, so the number of 'None' records is halved
    (rounded down). Afterwards the record total and the count / percentage of
    'None', 'Bullish' and 'Bearish' records are printed.

    The previous body could not run: it assigned to ``dfAPI`` inside the function
    (which makes that name local and raises UnboundLocalError on first use), never
    applied the drops to the frame it returned, and dropped rows while walking the
    positions so rows were skipped. Its "even position" test was a stand-in for
    "every other"; here the alternation is applied to the 'None' records themselves.
    """
    sentiment = df.iloc[:, 4]

    none_positions = [pos for pos, value in enumerate(sentiment) if value == 'None']
    dropped = set(none_positions[::2])  # every other 'None' record, starting with the first

    # positions are decided up front, so removing rows cannot shift what is examined
    kept = [pos for pos in range(len(df)) if pos not in dropped]
    df = df.iloc[kept].reset_index(drop = True)  # resets the index to be continuous

    total = len(df)
    sentiment = df.iloc[:, 4]

    print('\n The total number of records is: ', total)

    for label in ('None', 'Bullish', 'Bearish'):
        label_total = int((sentiment == label).sum())
        percent = int(label_total / total * 1000) / 10 if total else 0.0
        print('The number of "{}" stocktwits sentiment values is:'.format(label), label_total)
        print('The percentage of "{}" values is:'.format(label), percent, '%')

    return df
    

In [None]:
# convert to methods below this point ******

In [35]:
# 115 Provides statistics on sentiments; bullish, none or bearish.

def stats(df):
    """Print the record total and the count / share of each stocktwits sentiment.

    df: DataFrame whose column at position 4 holds the stocktwits sentiment text
        ('None', 'Bullish' or 'Bearish').

    The frame passed in is the one that is summarised (the body used to read the
    global ``dfAPI`` and ignored its argument). Percentages are truncated to one
    decimal place; an empty frame reports 0.0 % instead of dividing by zero.
    Nothing is returned.
    """
    total = len(df)
    sentiment = df.iloc[:, 4]

    print('The total number of records is: ', total)

    for label in ('None', 'Bullish', 'Bearish'):
        label_total = int((sentiment == label).sum())
        percent = int(label_total / total * 1000) / 10 if total else 0.0
        print('The number of "{}" stocktwits sentiment values is:'.format(label), label_total)
        print('The percentage of "{}" values is:'.format(label), percent, '%')
            
        

The total number of records is:  2500
The number of "None" stocktwits sentiment values is: 1408
The percentage of "None" values is: 56.3 %
The number of "Bullish" stocktwits sentiment values is: 840
The percentage of "Bullish" values is: 33.6 %
The number of "Bearish" stocktwits sentiment values is: 169
The percentage of "Bearish" values is: 6.7 %


In [25]:
# 120 Allows user to manually input value when stocktwits sentiment value is "None"
# It counts every 20 edits and gives the user the option to quit. If the user chooses to quit
# it breaks out of the loop and writes the df to a csv file so all work is saved up to that point.
# Upon start up it asks if this is the first time processing the raw data. If no, it loads the csv file into
# the dataframe and starts where the previous session left off. If "modified?" is "Yes" and "sentiment" is "None"
# it skips the record. Therefore it will re-start at the first record where "modified?" is "No" and "sentiment" is "None".

def edit(df):
    """Interactively rate the records whose stocktwits sentiment is 'None'.

    df: DataFrame laid out by position as 2 = body, 4 = sentiment,
        7 = sentiment_number, 8 = modified_rating, 9 = 'modified?'.

    Flow:
      * the user confirms or replaces the csv file name;
      * if this is not the first session, the frame is reloaded from that csv with
        getData, so work continues where the previous session stopped;
      * every record with sentiment 'None' that is still marked 'No' in 'modified?'
        is shown and the user's rating (1, 0 or -1) is stored in 'modified_rating';
        'modified?' is then set to 'Yes';
      * 'Bullish' / 'Bearish' records copy 'sentiment_number' into 'modified_rating';
      * after every 20 manual ratings the user may quit.

    The frame is always written to the csv before the function ends - also when the
    session is interrupted or an error occurs - so manual work is not lost. An invalid
    rating is asked for again instead of aborting the session ("-1." is accepted as -1).

    Returns the edited frame.
    """
    filename = "tech stockTwit 03112021 adjusted Rev1.csv"

    print('The name of the csv file that will be written to is: ', filename)

    correct_name = input('Is this the correct filename? (enter "N" or "n" for no)')
    if correct_name in ('N', 'n'):
        filename = input('What is the correct name?')

    load = input('Is this the first time processing the raw stocktwits data (enter "n"/"N" or "y"/"Y")? ')
    if load in ('n', 'N', 'no', 'No'):
        df = getData(filename)
        print('Loaded filename:', filename)
    else:
        print('ok')

    def ask_rating():
        # keeps asking until the answer is one of the three allowed ratings
        while True:
            answer = input('Enter your rating (1, 0 or -1.):').strip()
            try:
                value = float(answer)
            except ValueError:
                value = None
            if value in (1.0, 0.0, -1.0):
                return int(value)
            print('Please enter 1, 0 or -1.')

    counter = 0    # manual ratings since the user was last asked about quitting

    try:
        for i in range(len(df)):
            sentiment = df.iloc[i, 4]

            if sentiment == 'None' and df.iloc[i, 9] == 'No':
                print('\nindex number:', i, '\n', df.iloc[i, 2])
                df.iloc[i, 8] = ask_rating()  # writes the entered number to 'modified_rating'
                df.iloc[i, 9] = 'Yes'  # marks the record so the next session starts after it
                counter += 1

            elif sentiment == 'Bearish' or sentiment == 'Bullish':
                df.iloc[i, 8] = df.iloc[i, 7]  # copies 'sentiment_number' to 'modified_rating'

            if counter == 20:  # represents 20 edits
                stop = input('Do you want to quit? (Enter either a "y" or "Y") ')
                if stop in ('y', 'Y'):
                    print('You are exiting.')
                    break
                counter = 0  # another 20 records must be rated before asking again
    finally:
        # runs on normal completion, on quit and on any interruption
        df.to_csv(filename, index = False)
        print('The csv file was written. File name: ', filename)

    return df

In [42]:
# 140 This will change the modified rating to the nltk rating only when they are opposite to see if it improves 
#the accuracy number 

def change_opp_nltk(df):
    """Overwrite 'modified_rating' with the VADER bin wherever the two are exact opposites.

    df: DataFrame whose column at position 6 is 'compound_bin' and whose column at
        position 8 is 'modified_rating'. It is edited in place and returned.

    Only the pairs (-1, 1) and (1, -1) are changed; neutral values are left alone.
    The user is first asked to confirm or replace the output file name, and the
    resulting frame is written to that csv file.
    """
    filename = 'tech stockTwit 02232021 opposite compound_bin vs modified_rating.csv'

    print('The name of the csv file that will be written to is: ', filename)

    answer = input('Is this the correct filename? (enter "N" or "n" for no)')
    if answer in ('N', 'n'):
        filename = input('What is the correct name?')

    for row in range(len(df)):
        vader_bin = df.iloc[row, 6]
        rating = df.iloc[row, 8]

        bearish_vs_bullish = vader_bin == -1 and rating == 1
        bullish_vs_bearish = vader_bin == 1 and rating == -1

        if bearish_vs_bullish or bullish_vs_bearish:
            df.iloc[row, 8] = vader_bin  # change "modified rating" to "compound_bin"

    df.to_csv(filename, index = False)
    print('The csv file was written. File name: ', filename)

    return df

In [33]:
# 180 counts how many records have a rating of 0.0 in column 8 (reported as "None" values)
# NOTE: this definition has the same name as the earlier none_count and replaces it once this cell runs.

def none_count(df):
    """Print the number and share of records whose column 8 value equals 0.0.

    df: DataFrame with at least nine columns; position 8 is compared with 0.0.
    The percentage is truncated to one decimal place. Nothing is returned.
    """
    record_total = len(df)
    zero_total = int((df.iloc[:, 8] == 0.0).sum())

    print('The number of "None" stocktwits sentiment values is:', zero_total)
    print('The percentage of "None" values is:', (int(zero_total/record_total * 1000)/10), '%')
        

nltk stopwords

In [44]:
#440 sets up stopword removal; returns stopWords
def set_up_nltk_stopword_removal():
    """Download the nltk stopword corpus and return the English stopwords as a set.

    Prints the number of stopwords in the set before returning it.
    """
    import nltk
    from nltk.corpus import stopwords
    # these two names are only bound inside this function; they are not made
    # available to the rest of the notebook by this import
    from nltk.tokenize import sent_tokenize, word_tokenize

    nltk.download('stopwords')

    english_stopwords = set(stopwords.words('english'))
    print(len(english_stopwords))

    return english_stopwords

In [51]:
#470 creates a list of new stopwords and then adds them to the set provided by nltk
#Note: it is case sensitive
#Input is the nltk stopword list ("stopWords")

def add_new_stopwords(sw):
    """Merge the project's extra stopwords into ``sw`` and return them as a sorted list.

    sw: the current stopwords - the set returned by nltk, or any other iterable of
        strings (for example the list this function returned earlier), so the calling
        cell can be re-run without failing on a missing ``add`` method.

    Matching against these words is case sensitive, which is why several words appear
    in more than one spelling.

    Side effect kept from the original: when ``sw`` is a set the new words are added
    to that very set. Other iterables are left untouched.

    Returns a list without duplicates, sorted alphabetically ignoring case; words that
    differ only in case are ordered by their exact spelling so the result is stable.
    Prints the number of stopwords and the sorted list.
    """
    newStopWords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
    newStopWords += ['again', 'against', 'all', 'almost', 'alone', 'along']
    newStopWords += ['already', 'also', 'although', 'always', 'am', 'among']
    newStopWords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
    newStopWords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
    newStopWords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
    newStopWords += ['because', 'become', 'becomes', 'becoming', 'been']
    newStopWords += ['before', 'beforehand', 'behind', 'being', 'below']
    newStopWords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
    newStopWords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
    newStopWords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
    newStopWords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
    newStopWords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
    newStopWords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
    newStopWords += ['every', 'everyone', 'everything', 'everywhere', 'except']
    newStopWords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
    newStopWords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
    newStopWords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
    newStopWords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
    newStopWords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
    newStopWords += ['herself', 'him', 'himself', 'his', 'how', 'however']
    newStopWords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
    newStopWords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
    newStopWords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
    newStopWords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
    newStopWords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
    newStopWords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
    newStopWords += ['nevertheless', 'next', 'nine', 'nobody', 'none'] #removed 'no'
    newStopWords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
    newStopWords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
    newStopWords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
    newStopWords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
    newStopWords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
    newStopWords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
    newStopWords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
    newStopWords += ['some', 'somehow', 'someone', 'something', 'sometime']
    newStopWords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
    newStopWords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
    newStopWords += ['then', 'thence', 'there', 'thereafter', 'thereby']
    newStopWords += ['therefore', 'therein', 'thereupon', 'these', 'they']
    newStopWords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
    newStopWords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
    newStopWords += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
    newStopWords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
    newStopWords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what']
    newStopWords += ['whatever', 'when', 'whence', 'whenever', 'where']
    newStopWords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
    newStopWords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
    newStopWords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
    newStopWords += ['within', 'without', 'would', 'yet', 'you', 'your']
    newStopWords += ['yours', 'yourself', 'yourselves'] #provided by Codecademy??

    # additional stopwords:
    newStopWords += ['[Screenshot]', '[screenshot]', 'Screenshot', '[Screenshot]Great', '[SCREENSHOT]', 'screenshot', 
                 'The', 'the', 'SMART', 'yah', 'got', 'nutty', 'moving', 'weeks', 'Got', 'So', 'today', 'Been', 'or']

    newStopWords += ['I', 'it', 'It'] # pronouns

    newStopWords += ['AMD', 'NVDA','NVDA', 'TSLA', 'GOOG', 'BA', 'FB', 'GOOGL', 'INTC', 'intel', 'Intel', 'CSCO', 'MU', 
                 'SMH', 'TSM','AAPL', 'TSLA', 'CSCO', 'POETF', 'PHOTONICS', 'DD', 'ARWR', 'T', 'INFI', 'AMC', 'ARK',
                'GME', 'NIO', 'QS'] # Stock symbols or names

    newStopWords += ['Readytogo123', 'Maddog68','Stocktwits'] # nouns

    newStopWords += ['.', '?', '!', ';', ',', "'"] # punctuation

    newStopWords += ['&', '#', '%', '$', '@'] # symbols

    newStopWords += ['41.75', '530.05', '39', 'Two', 'two',] # numbers

    # A set holds each word once, so repeated entries above are harmless.
    if isinstance(sw, set):
        sw.update(newStopWords)  # keeps the historical behaviour of extending the caller's set
        merged = sw
    else:
        merged = set(sw)
        merged.update(newStopWords)

    print(len(merged))

    # a set has no order, so it is turned into a list to be alphabetized
    stopWords_list = sorted(merged, key = lambda word: (word.lower(), word))
    print(stopWords_list)

    return stopWords_list



442
['!', '#', '$', '%', '&', "'", ',', '.', '39', '41.75', '530.05', ';', '?', '@', '[Screenshot]', '[SCREENSHOT]', '[screenshot]', '[Screenshot]Great', 'a', 'AAPL', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'ain', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'AMC', 'AMD', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'aren', "aren't", 'ARK', 'around', 'ARWR', 'as', 'at', 'BA', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'Been', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'computer', 'con', 'could', 'couldn', "couldn't", 'couldnt', 'cry', 'CSCO', 'd', 'DD', 'de', 'describe', 'detail', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'done', '

In [179]:
# Quick check of the container type of stopWords (the recorded output of this cell is `set`).
type(stopWords)


set

In [52]:
#480 This removes words from the list of stopwords and writes list to csv file
# https://stackoverflow.com/questions/29771168/how-to-remove-words-from-a-list-in-python#:~:text=one%20more%20easy%20way%20to%20remove%20words%20from,%3D%20words%20-%20stopwords%20final_list%20%3D%20list%20%28final_list%29
#new_words = list(filter(lambda w: w not in stop_words, initial_words))

def remove_from_stopwords(sw, words_to_remove=('no',), csv_path='stopwords.csv'):
    """Take words back out of the stopword list and save the result as a csv file.

    sw:              iterable of stopwords (the sorted list built in #470); its order is kept.
    words_to_remove: a single word or an iterable of words that must not be treated as
                     stopwords. Defaults to 'no', the word that was hard-coded before.
    csv_path:        file the resulting list is written to, in a single column named
                     'stopwords'. Defaults to 'stopwords.csv'.

    Prints and returns the filtered list; ``sw`` itself is not changed.
    """
    if isinstance(words_to_remove, str):
        words_to_remove = [words_to_remove]
    removal = set(words_to_remove)

    stopWords = [word for word in sw if word not in removal]

    #converts the stopword list to a df and then outputs the df to a csv file
    df_stopwords = pd.DataFrame(stopWords, columns = ['stopwords'])
    df_stopwords.to_csv(csv_path, index = False)

    print(stopWords)

    return stopWords


['!', '#', '$', '%', '&', "'", ',', '.', '39', '41.75', '530.05', ';', '?', '@', '[Screenshot]', '[SCREENSHOT]', '[screenshot]', '[Screenshot]Great', 'a', 'AAPL', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'ain', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'AMC', 'AMD', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'aren', "aren't", 'ARK', 'around', 'ARWR', 'as', 'at', 'BA', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'Been', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'computer', 'con', 'could', 'couldn', "couldn't", 'couldnt', 'cry', 'CSCO', 'd', 'DD', 'de', 'describe', 'detail', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'done', 'down

"\n# this also works; from same link as above\nWordsToBeRem = ['no', 'mill']\nfor remword in list(stopWords):\n    if remword in WordsToBeRem:\n        stopWords_list.remove(remword)\n"

In [182]:
#490 Checks to see if the words were removed from the stopWords list.
#inputs: stopword list (output from def remove_from_stopwords(sw)); the word to be removed

def check_stopwords(sw, WordToBeRem):
    """Report whether the given word(s) are still present in the stopword list.

    sw:          iterable of stopwords to inspect.
    WordToBeRem: the word that should be gone, or an iterable of such words.

    Words are compared as whole words. (With a plain string the previous test
    ``w in WordToBeRem`` was a substring test, so stopwords such as 'n' or 'o' were
    reported as "still in the list" when checking for 'no'.)

    Prints one line per word that is still present, or a confirmation when none is.
    Nothing is returned.
    """
    if isinstance(WordToBeRem, str):
        targets = {WordToBeRem}
    else:
        targets = set(WordToBeRem)

    leftovers = [w for w in sw if w in targets]

    for w in leftovers:
        print('The word ', w , ' is still in the stopWords list!')

    if not leftovers:
        print('It did remove the words from the stopWords list!')
    
    #print(len(stopWords))

It did remove the words from the stopWords list!


In [53]:
#510 Removes stopwords from all the "body" text (tweets); to do this it must tokenize the string which means it must parse 
# the string into individual words. It then compares the words with the words in the stopwords list and if there is not 
# match it puts the word into the "wordsFiltered" list. It keeps appending to the list until all of the words are checked.
# It then joins the individual words back into a string.

#There is a difference between "deep" copy and "shallow" copy. "Deep" copy make a copy where the index and data are
# separate from the original. "Shallow" copy is like a pointer where the two df share a common index and data
#dfAPIScrubbed = dfAPI #This is a shallow copy

def rem_stopwords(df, stopWords):
    """Return a copy of ``df`` whose tweet text has the stopwords removed.

    df:        DataFrame with the tweet text in its third column (position 2, 'body').
               It is not modified.
    stopWords: iterable of the words to strip; the comparison is case sensitive.

    Each text is split into tokens with nltk's word_tokenize, tokens found in
    ``stopWords`` are discarded and the rest is joined back with single spaces.

    The copy is taken from the ``df`` argument (it used to be taken from the global
    ``dfAPI``, which breaks as soon as another frame is passed in). Prints the first
    rows of the scrubbed frame and returns it.
    """
    # imported here so the function does not rely on another cell having bound the name
    from nltk.tokenize import word_tokenize

    stop_lookup = set(stopWords)  # constant-time membership test for every token

    def scrub(text):
        return ' '.join(w for w in word_tokenize(text) if w not in stop_lookup)

    dfScrubbed = df.copy() #This is a deep copy; deep = True is the default

    body_label = df.columns[2]
    dfScrubbed[body_label] = [scrub(text) for text in df.iloc[:, 2]]

    print(dfScrubbed.head())

    return dfScrubbed

  symbol            created_at  \
0   INTC  2021-03-05T20:51:14Z   
1   INTC  2021-03-05T20:06:56Z   
2   INTC  2021-03-05T19:57:20Z   
3   INTC  2021-03-05T19:52:43Z   
4   INTC  2021-03-05T19:44:47Z   

                                                body followers sentiment  \
0                              common follow ur sibs        48   Bullish   
1                                 ITT ADBE OPTT GLBS       575   Bullish   
2  Should thankful bull market Dow couple reachin...        21   Bullish   
3  ButterFingerDROPs sell October Still probably ...        77      None   
4           Trading easy Buy Short signals real time       162      None   

   raw_compound  compound_bin  sentiment_number  modified_rating modified?  
0        0.0000           0.0               1.0                0        No  
1        0.0000           0.0               1.0                0        No  
2        0.6996           1.0               1.0                0        No  
3        0.0000           0.0 

In [13]:
# compares the first record (index = 0) raw data ("body" column) with scrubbed (stopwords removed) data
#inputs: df - original df; dfs - scrubbed df (stopwords removed)
def compare_scrubbed(df, dfs):
    """Print the first record's text (row 0, column 2) from both frames.

    Inputs:
        df  - original dataframe.
        dfs - scrubbed dataframe (stopwords removed).

    The original text is printed first, the scrubbed text second.
    """
    for frame in (df, dfs):
        print(frame.iloc[0, 2])

$AMD common follow ur sibs $INTC $MU
common follow ur sibs


In [14]:
#530 Compares compound_bin (column 6, the nltk rating) with modified_rating (column 8). modified_rating is the
# rating in which the "None" ('0.0') ratings were re-assessed; this was done to account for tweets where the person
# had a sentiment but did not bother to input one.

# Tally the rows whose 'compound_bin' (column 6) differs from the
# 'modified_rating' (column 8) once the latter is truncated to an int.
i = 0
counter = 0
total_records = len(dfAPIScrubbed)

while i < total_records:
    nltk_bin = dfAPIScrubbed.iloc[i, 6]        # column 6 is 'compound_bin'
    modified = int(dfAPIScrubbed.iloc[i, 8])   # column 8 is 'modified_rating'
    if nltk_bin - modified != 0:
        counter += 1
    i += 1

print('The number of compound_bin scores that are different than the modified_rating:', counter)
print('The total number of records is:', len(dfAPIScrubbed))

The number of compound_bin scores that are different than the modified_rating: 749
The total number of records is: 1291


In [15]:
#540 Pulls up the record of interest; you must enter the index number.

# Ask for a row position, then show that record's text before and after
# stop-word removal, followed by one value from each dataframe.
index = int(input('Enter the index number.:'))

for frame in (dfAPI, dfAPIScrubbed):
    print(frame.iloc[index, 2])  # column 2 holds the text

# NOTE(review): elsewhere in this notebook column 6 is described as
# 'compound_bin' and column 7 appears to be 'sentiment_number'; the "/ 10"
# and the "Scrubbed nltk sentiment" label may be left over from an earlier
# column layout - confirm which columns are intended.
original_value = dfAPI.iloc[index, 6] / 10
print('Original nltk sentiment:', original_value)
scrubbed_value = dfAPIScrubbed.iloc[index, 7]
print('Scrubbed nltk sentiment:', scrubbed_value)

Enter the index number.:100
While yesterday’s market action looks ominous  I would like to remind people that the federal government will approve another 1.9 billion of stimulus  more than $2.5B in 2 months.  The fed will continue to buy treasuries and for that reason you should not care about the 10 year note.  Much of this stimulus will end up back in the market.  The smartmoney have this completely wrong.  ..Despite the huge drop yesterday  I am still up 36% in 2021.  If you can afford to  take advantage of the sales.  Here are my main holdings and the performance since 4th qtr. 2020; not including INTC. ..Buy the fear!..SBH 66.6%.$PRTY 187.7%.$PERI 184.2%.AOSL 163.7%.ACLS 66.2%.$PRPL 44.0%.$FNF 23.3%.$INTC 23.9%
While yesterday ’ market action looks ominous like remind people federal government approve 1.9 billion stimulus 2.5B 2 months fed continue buy treasuries reason care 10 year note Much stimulus end market smartmoney completely wrong .. Despite huge drop yesterday 36 2021 If

In [16]:
# compares the pre-scrubbed body with the post-scrubbed body (stopwords removed)
# Row 334: text column (position 2) from the original frame, then from the scrubbed frame.
for frame in (dfAPI, dfAPIScrubbed):
    print(frame.iloc[334, 2])


$HPQ amazing beat $spy $NVDA $INTC
HPQ amazing beat spy


Create a new model with the scrubbed body data (stopwords removed). It can be run with labels (sentiments) that were generated with nltk Vader, or with the stocktwits sentiment scores in which the "None" sentiment ratings have been re-evaluated.


In [185]:
#550 converts the scrubbed_compound scores into a 1 significant figure integer from a float number; rounding up
# this is only needed if you are going to uses the 'scrubbed_compound' value as the label.

def int_conversion(dfs):
    """Replace 'scrubbed_compound' with its value x10, rounded to an integer.

    Input:
        dfs - dataframe with a numeric 'scrubbed_compound' column; the column
              is overwritten in place with int64 values. Nothing is returned.

    Rounding is to the nearest integer with halves going up. The earlier
    version added .05 and then truncated toward zero, which only rounds
    correctly for non-negative scores (e.g. -0.56 became -5 instead of -6).
    Taking the floor after the same shift fixes negative scores and leaves
    every non-negative result unchanged.

    Not idempotent: calling it a second time rescales the already-scaled
    values again.
    """
    shifted = (dfs['scrubbed_compound'] + .05) * 10
    dfs['scrubbed_compound'] = np.floor(shifted.to_numpy()).astype(np.int64)

In [186]:
# 550 converts the 'scrubbed_compound' (column 10) data to either a 1, 0 or -1:
# 1 if the nltk sentiment number is >= .1; 0 if -.1 < x < .1; and -1 if <= -.1.
# The result is written to a new column called 'scrubbed_compound_bin' (column 11).

def bin_sentiment(dfs):
    """Add a 'scrubbed_compound_bin' column derived from 'scrubbed_compound'.

    Input:
        dfs - dataframe with a numeric 'scrubbed_compound' column. The new
              column is written in place; nothing is returned.

    Binning rule for each score x:
        x >= .1        -> x + .9 truncated toward zero
        -.1 < x < .1   -> 0
        x <= -.1       -> x - .9 truncated toward zero
        missing (NaN)  -> left as NaN
    For scores within [-1, 1] this yields exactly 1, 0 or -1. Larger
    magnitudes keep the truncation behaviour of the original loop.

    Changes from the earlier version:
      * works on every row of ``dfs`` (the loop used to be bounded by the
        length of the notebook-level ``dfAPI``);
      * no longer calls ``np.int``, which was removed from NumPy;
      * addresses the columns by name instead of assuming positions 10 and 11.

    Side effect: prints the dataframe.
    """
    scores = dfs['scrubbed_compound'].astype(float)

    binned = np.where(
        scores >= 0.1,
        np.trunc(scores + .9),
        np.where(
            scores <= -0.1,
            np.trunc(scores - .9),
            # Neither branch matched: zero for in-band scores, NaN stays NaN.
            np.where(scores.isna(), np.nan, 0.0),
        ),
    )
    dfs['scrubbed_compound_bin'] = binned

    print(dfs)

     symbol            created_at  \
0      INTC  2021-02-23T18:37:55Z   
1      INTC  2021-02-23T18:10:05Z   
2      INTC  2021-02-23T17:41:45Z   
3      INTC  2021-02-23T17:36:25Z   
4      INTC  2021-02-23T16:55:34Z   
...     ...                   ...   
1275     MU  2021-02-01T20:10:14Z   
1276     MU  2021-02-01T20:07:12Z   
1277     MU  2021-02-01T19:40:37Z   
1278     MU  2021-02-01T19:37:10Z   
1279     MU  2021-02-01T19:25:26Z   

                                                   body followers sentiment  \
0     .. Wow _Great 📈 gains 2490 15 returns A big th...         1      None   
1                        chart looking like tech bubble         1   Bearish   
2     Come peek Quick bet pull trigger couple grand ...        33   Bullish   
3                      added If gets cheaper write puts       261   Bullish   
4     Third Point Sees Enormous Shareholder Value Cr...        12   Bullish   
...                                                 ...       ...       ...   
12

In [117]:
# Preview the first rows of the scrubbed dataframe.
print(dfAPIScrubbed.head())

  symbol            created_at  \
0   INTC  2021-02-23T18:37:55Z   
1   INTC  2021-02-23T18:10:05Z   
2   INTC  2021-02-23T17:41:45Z   
3   INTC  2021-02-23T17:36:25Z   
4   INTC  2021-02-23T16:55:34Z   

                                                body followers sentiment  \
0  .. Wow _Great 📈 gains 2490 15 returns A big th...         1      None   
1                     chart looking like tech bubble         1   Bearish   
2  Come peek Quick bet pull trigger couple grand ...        33   Bullish   
3                   added If gets cheaper write puts       261   Bullish   
4  Third Point Sees Enormous Shareholder Value Cr...        12   Bullish   

   raw_compound  compound_bin  sentiment_number  modified_rating modified?  \
0        0.7243           1.0               0.0              1.0       Yes   
1        0.3612           1.0              -1.0             -1.0        No   
2        0.7184           1.0               1.0              1.0        No   
3        0.0000           

In [192]:
#610 Preview the first rows of the scrubbed dataframe.
print(dfAPIScrubbed.head())

  symbol            created_at  \
0   INTC  2021-02-23T18:37:55Z   
1   INTC  2021-02-23T18:10:05Z   
2   INTC  2021-02-23T17:41:45Z   
3   INTC  2021-02-23T17:36:25Z   
4   INTC  2021-02-23T16:55:34Z   

                                                body followers sentiment  \
0  .. Wow _Great 📈 gains 2490 15 returns A big th...         1      None   
1                     chart looking like tech bubble         1   Bearish   
2  Come peek Quick bet pull trigger couple grand ...        33   Bullish   
3                   added If gets cheaper write puts       261   Bullish   
4  Third Point Sees Enormous Shareholder Value Cr...        12   Bullish   

   raw_compound  compound_bin  sentiment_number  modified_rating modified?  \
0        0.7243           1.0               0.0              1.0       Yes   
1        0.3612           1.0              -1.0             -1.0        No   
2        0.7184           1.0               1.0              1.0        No   
3        0.0000           

In [44]:
#620


In [22]:
# 650 Loads and combines two different dataframes in dfAPI; this is to combine two input datasets where the 'none'
#values have been modified; this is to see if increased records will increase the accuracy of the model.

def combine_dfs(filename1="tech stockTwit 03112021 adjusted-Copy1.csv",
                filename2="tech stockTwit 02232021 adjusted-Copy1.csv"):
    """Load two csv files with getData and stack them into one dataframe.

    Inputs:
        filename1, filename2 - csv files to load; the defaults are the two
                               files this notebook combined originally.

    Output:
        The combined dataframe: the rows of file 1 followed by the rows of
        file 2. Row labels are kept as loaded, so labels may repeat.

    Side effect: prints the length of each input and of the combined result.

    Changes from the earlier version: the ``def`` line was missing its colon,
    ``DataFrame.append`` (removed in pandas 2.0) is replaced by ``pd.concat``,
    and the combined frame is now returned instead of being bound to a local
    name and discarded. Use it as ``dfAPI = combine_dfs()``.
    """
    dfAPI1 = getData(filename1)
    dfAPI2 = getData(filename2)

    combined = pd.concat([dfAPI1, dfAPI2])

    print('The length of file 1 is:', len(dfAPI1))
    print('The length of file 2 is:', len(dfAPI2))

    print('The length of the combined dataframe is:', len(combined))

    return combined


The length of file 1 is: 1291
The length of file 2 is: 1280
The length of the combined dataframe is: 2571


'load = input(\'Is this the first time processing the raw stocktwits data (enter "n"/"N" or "y"/"Y")? \')\nif load == \'n\' or load == \'N\' or load == \'no\' or load == \'No\':\n    dfAPI = getData(filename)\n    print(\'Loaded filename:\', filename)\nelse:\n    \n    print(\'ok\')\n    '

In [54]:
# Writes a dataframe to a csv file
#input: df that is to be saved as a csv; output file name (e.g. 'tech stockTwit 03112021 dup advert stopwords.csv')

def write_csv(df, filename_output):
    """Save ``df`` to ``filename_output`` as csv (index column omitted).

    Inputs:
        df              - dataframe to save.
        filename_output - path of the csv file to write.

    Prints a confirmation line containing the file name once written.
    """
    df.to_csv(path_or_buf=filename_output, index=False)
    confirmation = ('The csv file was written. File name: ', filename_output)
    print(*confirmation)

The csv file was written. File name:  tech stockTwit 03112021 dup advert stopwords.csv
