This program gives the user 3 different cloud source data platforms to choose from:
    finviz
    stocktwits
    reddit

In [1]:
# File name: preprocessingProduction
import pandas as pd
import numpy as np
#import nltk
#from nltk.sentiment.vader import SentimentIntensityAnalyzer
#import nltk.classify
#from nltk import NaiveBayesClassifier
import os
import re
from bs4 import BeautifulSoup
import sys
import time
#analyzer = SentimentIntensityAnalyzer()

#from nltk.corpus import stopwords

************************
Table of Contents
#10* initializes the dataframe "df" and imports the csv into df; 
#20* calls getdata to import the csv into the dataframe, 'dfAPI'
#30 removes any duplicate records; duplicate records imply bot records
#40 finds certain words in the strings ('body') and deletes the entire record.  
#50* Vader sentiment analyzer
#60* creates a new column called 'compound_bin' from the raw_compound scores
#70* converts the 'raw_compound' data to either a 1, 0 or -1: 1 if the nltk sentiment number is >= .1; 0 if -.1 < x < .1; -1 if <= -.1
#80* Converts sentiment ratings into numerical values and put the value into 'sentiment_number'.
#90 Determines the percent correct and incorrect for the Vader sentiment values vs the stocktwits sentiment values
#100 counts how many "None" sentiment values are there for the stocktwits sentiment value
#110 This removes every other "None" record to reduce the total number of "None" rating. This is to make
#115 Provides statistics on sentiments; bullish, none or bearish.
#120 Allows user to manually input value when stocktwits sentiment value is "None"
#130 Loads a csv file into the df dfAPI and print out the first 21 records
#140 This will change the modified rating to the nltk rating only when they are opposite to see if it improves 
the accuracy number 
#440 sets up stopword removal; returns stopWords
#470 creates a list of new stopwords and then adds them to the set provided by nltk
Note: it is case sensitive; Input is the nltk stopword list ("stopWords")
#490 Checks to see if the words were removed from the stopWords list.
inputs: the stopword list (the output of def remove_from_stopwords(sw)) and the word to be removed
#510 Removes stopwords from all the "body" text (tweets); to do this it must tokenize the string, which means it must parse 
the string into individual words. It then compares the words with the words in the stopwords list and if there is no 
match it puts the word into the "wordsFiltered" list. It keeps appending to the list until all of the words are checked.
It then joins the individual words back into a string.
There is a difference between a "deep" copy and a "shallow" copy. A "deep" copy makes a copy where the index and data are
separate from the original. A "shallow" copy is like a pointer where the two df share a common index and data
dfScrubbed = df #This is a shallow copy
#550 converts the scrubbed_compound scores into a 1 significant figure integer from a float number; rounding up
this is only needed if you are going to use the 'scrubbed_compound' value as the label.
#550 converts the 'scrubbed_compound' (column 10) data to either a 1, 0 or -1:  
1 if the nltk sentiment number is >= .1; 0 if -.1 < x < .1 and -1 if <= -.1; it overwrites the value in compound_bin
creates a new column called 'compound_bin' from the raw_compound scores
#640 compares the first record (index = 0) raw data ("body" column) with scrubbed (stopwords removed) data
inputs: df - original df; dfs - scrubbed df (stopwords removed)
#650 Loads and combines two different dataframes in df; this is to combine two input datasets where the 'none'
values have been modified; this is to see if increased records will increase the accuracy of the model.
#660 Writes a csv file

METHODS

In [2]:
# methods

class SentimentAnalysisPreprocessing():
    """Cleaning steps that prepare scraped stock-message data for sentiment work.

    The text-scrubbing steps read the raw text from the 'body' column (or the
    previous step's 'body_processed' column) and write their result to
    'body_processed' on the dataframe that was passed in, then return that
    same dataframe so the steps can be chained.

    The notebook-only ``%time`` line magics were replaced by the plain-Python
    helper ``_timed`` so that the class also works outside IPython.
    """

    # Answers the interactive prompts treat as "yes".
    _YES_RESPONSES = ('yes', 'YES', 'y', 'Y', 'Yes')

    # @mentions, any character that is not a letter/digit/space/tab, and URLs.
    _MENTION_SYMBOL_URL_RE = re.compile(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")
    _PUNCTUATION_RE = re.compile(r'[^\w\s]')
    _NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')

    # Character class used by remove_emoji; compiled once instead of per call.
    # NOTE(review): the ranges \U000024C2-\U0001F251 and \U00010000-\U0010ffff
    # are very broad and also strip non-emoji characters (e.g. CJK text).
    _EMOJI_RE = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002500-\U00002BEF"
                           u"\U00002702-\U000027B0"
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           u"\U0001f926-\U0001f937"
                           u"\U00010000-\U0010ffff"
                           u"\u2640-\u2642"
                           u"\u2600-\u2B55"
                           u"\u200d"
                           u"\u23cf"
                           u"\u23e9"
                           u"\u231a"
                           u"\ufe0f"
                           u"\u3030"
                           "]+", flags=re.UNICODE)

    # fnmatch-style patterns; a record whose 'body' matches any of them is
    # treated as an advertisement / bot post by filter_records.
    # NOTE(review): the entries without '*' only match when the whole body
    # equals the phrase; confirm whether a substring match was intended.
    _ADVERT_PATTERNS = (
        '* sec *', '* daily News *', '*Huge Print*', '* Form *',
        '*SweepCast*', '*Large Print*', '*Huge Print*', '*8-K*',
        '*SmartOptions*', '*Big Trade*', '*SEC Form*', '*Notice of Exempt*',
        '*created_at*', '*stock news*', '*Trading Zones*', '*Entry:*',
        '*New Article*', '*ooc.bz*', '*http*', 'Huge Trade', 'Trading is easy',
        'www.', '#wallstreetbets', 'wallstreetbets', 'Huge Trade', '#unitedtraders',
        'stockbeep.com', 'Big Trade',
    )

    # Words removed by remove_stopwords in addition to the caller's list.
    # Matching is case sensitive.
    _EXTRA_STOP_WORDS = (
        'a', 'about', 'above', 'across', 'after', 'afterwards',
        'again', 'against', 'all', 'almost', 'alone', 'along',
        'already', 'also', 'although', 'always', 'am', 'among',
        'amongst', 'amoungst', 'amount', 'an', 'and', 'another',
        'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
        'are', 'around', 'as', 'at', 'back', 'be', 'became',
        'because', 'become', 'becomes', 'becoming', 'been',
        'before', 'beforehand', 'behind', 'being', 'below',
        'beside', 'besides', 'between', 'beyond', 'bill', 'both',
        'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant',
        'co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de',
        'describe', 'detail', 'did', 'do', 'done', 'down', 'due',
        'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else',
        'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever',
        'every', 'everyone', 'everything', 'everywhere', 'except',
        'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first',
        'five', 'for', 'former', 'formerly', 'forty', 'found',
        'four', 'from', 'front', 'full', 'further', 'get', 'give',
        'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her',
        'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers',
        'herself', 'him', 'himself', 'his', 'how', 'however',
        'hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed',
        'interest', 'into', 'is', 'it', 'its', 'itself', 'keep',
        'last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made',
        'many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine',
        'more', 'moreover', 'most', 'mostly', 'move', 'much',
        'must', 'my', 'myself', 'name', 'namely', 'neither', 'never',
        'nevertheless', 'next', 'nine', 'nobody', 'none',  # 'no' is deliberately kept
        'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of',
        'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or',
        'other', 'others', 'otherwise', 'our', 'ours', 'ourselves',
        'out', 'over', 'own', 'part', 'per', 'perhaps', 'please',
        'put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed',
        'seeming', 'seems', 'serious', 'several', 'she', 'should',
        'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so',
        'some', 'somehow', 'someone', 'something', 'sometime',
        'sometimes', 'somewhere', 'still', 'such', 'system', 'take',
        'ten', 'than', 'that', 'the', 'their', 'them', 'themselves',
        'then', 'thence', 'there', 'thereafter', 'thereby',
        'therefore', 'therein', 'thereupon', 'these', 'they',
        'thick', 'thin', 'third', 'this', 'those', 'though', 'three',
        'three', 'through', 'throughout', 'thru', 'thus', 'to',
        'together', 'too', 'top', 'toward', 'towards', 'twelve',
        'twenty', 'two', 'un', 'under', 'until', 'up', 'upon',
        'us', 'very', 'via', 'was', 'we', 'well', 'were', 'what',
        'whatever', 'when', 'whence', 'whenever', 'where',
        'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
        'wherever', 'whether', 'which', 'while', 'whither', 'who',
        'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with',
        'within', 'without', 'would', 'yet', 'you', 'your',
        'yours', 'yourself', 'yourselves',
        # additional stopwords
        '[screenshot]', 'screenshot', '[screenshot]great', 'screenshot',
        'the', 'smart', 'yah', 'got', 'nutty', 'moving', 'weeks', 'Got', 'So',
        'today', 'Been', 'or',
        # pronouns
        'i', 'you', 'He', 'he', 'she', 'they', 'their', 'it',
        # stock symbols or names
        'amd', 'nvda', 'tsla', 'goog', 'ba', 'fb', 'googl', 'intc', 'intel', 'csco', 'mu',
        'smh', 'tsm', 'aapl', 'csco', 'poetf', 'photonics', 'dd', 'arwr', 't', 'infini',
        'amc', 'arl', 'gme', 'nio', 'qs', 'msft', 'adbe', 'unh',
        # symbols
        '&', '#', '%', '$', '@', '/',
        # numbers
        '41.75', '530.05', '39', 'Two', 'two', 'One', 'one', 'Three', 'three', 'Four', 'four',
        'Five', 'five', 'Six', 'six', 'Seven', 'seven', 'Eight', 'eight', 'Nine', 'nine', 'Ten',
        'ten', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '39', ' 270',
        '270000', '4033477', '244', '16', '399', '800', '270', '000', '60', '74',
        '1600', '993', '392', '98', '00', '1601',
    )

    def __init__(self, df):
        """Keep a reference to *df*; the methods below take their frame as an argument."""
        self.df = df

    @staticmethod
    def _timed(func, *args, **kwargs):
        """Call ``func(*args, **kwargs)``, print the elapsed wall time, return its result.

        Plain-Python stand-in for the notebook's ``%time`` line magic.
        """
        start = time.perf_counter()
        result = func(*args, **kwargs)
        print('Wall time: %.3f s' % (time.perf_counter() - start))
        return result

    @staticmethod
    def _prompt_rating():
        """Ask for a manual rating until the answer is 1, 0 or -1 and return it as an int."""
        while True:
            answer = input('Enter your rating (1, 0 or -1.):')
            try:
                rating = int(answer)
            except ValueError:
                rating = None
            if rating in (1, 0, -1):
                return rating
            print('Please enter 1, 0 or -1.')

    # 10 imports the csv into a dataframe
    # https://stackoverflow.com/questions/33440805/pandas-dataframe-read-csv-on-bad-data
    def getData(self, name):
        """Read the csv *name* (path or file-like; first row is the header) into a dataframe."""
        return pd.read_csv(name, header=0)

    def remove_duplicate_headers(self, df):
        """Drop rows whose 'symbol' value is the literal text 'symbol'.

        The matching rows are dropped from *df* in place; the returned frame is
        a re-indexed copy.
        """
        print('\nDropping duplicate headers ...')
        header_rows = df[df['symbol'] == 'symbol'].index
        self._timed(df.drop, header_rows, inplace=True)
        return df.reset_index(drop=True)

    # 30 removes any duplicate records; duplicate records imply bot records
    def remove_duplicates(self, df):
        """Return *df* without fully duplicated rows, re-indexed from 0."""
        print('\nDropping duplicates ...')
        deduplicated = self._timed(df.drop_duplicates)
        return deduplicated.reset_index(drop=True)

    def remove_http_tags(self, df):
        """Create 'body_processed' from 'body' with mentions, symbols and URLs removed.

        Every @mention, every character that is not a letter, digit, space or
        tab, and every ``scheme://...`` URL is replaced by a space; runs of
        whitespace are then collapsed to single spaces.
        """
        print('\nRemoving http tags ...')
        pattern = self._MENTION_SYMBOL_URL_RE

        def clean(text):
            return ' '.join(pattern.sub(' ', text).split())

        df['body_processed'] = self._timed(df['body'].map, clean)
        return df

    def lower_case(self, df):
        """Convert 'body_processed' to lower case."""
        print('\nConverting to lower case ...')
        df['body_processed'] = self._timed(df['body_processed'].map, str.lower)
        return df

    def remove_punctuation(self, df):
        """Delete every character of 'body_processed' that is neither a word character nor whitespace."""
        print('\nRemoving punctuation ...')
        pattern = self._PUNCTUATION_RE
        df['body_processed'] = self._timed(
            df['body_processed'].map, lambda text: pattern.sub('', text))
        return df

    def remove_unicode(self, df):
        """Replace each run of non-ASCII characters (e.g. emojis) in 'body_processed' with one space."""
        print('\nRemoving unicode ...')
        pattern = self._NON_ASCII_RE
        df['body_processed'] = self._timed(
            df['body_processed'].map, lambda text: pattern.sub(' ', text))
        return df

    def lemmatize(self, df, stop_words):
        """Lemmatize each word of 'body_processed', skipping words found in *stop_words*.

        The WordNet lemmatizer is created here; nltk and its 'wordnet' corpus
        must be installed.
        """
        print('\nLemmatizing ...')

        from nltk.stem import WordNetLemmatizer

        lemmer = WordNetLemmatizer()
        skip = set(stop_words)  # O(1) membership test per word

        def lemmatize_text(text):
            return ' '.join([lemmer.lemmatize(w) for w in text.split() if w not in skip])

        df['body_processed'] = self._timed(df['body_processed'].map, lemmatize_text)
        return df

    def remove_stopwords(self, df, stop_words):
        """Remove *stop_words* plus the built-in extra words from 'body_processed'.

        *stop_words* is any iterable of words supplied by the caller (e.g. the
        nltk English list).  It is no longer modified: the previous version
        appended the extra words to the caller's list on every call.
        Matching is case sensitive.
        """
        print('\nRemoving stopwords ...')

        unwanted = set(stop_words)
        unwanted.update(self._EXTRA_STOP_WORDS)

        def keep_wanted(text):
            return ' '.join([w for w in text.split() if w not in unwanted])

        df['body_processed'] = self._timed(df['body_processed'].map, keep_wanted)
        return df

    # 40 finds certain phrases in the 'body' strings and deletes the entire record.
    def filter_records(self, df):
        """Return *df* without the records whose 'body' matches an advert pattern.

        Matching is case-insensitive on every platform (``fnmatch.filter``,
        used previously, is only case-insensitive on Windows).  Non-string
        bodies never match and are kept.  The result is re-indexed from 0.
        """
        import fnmatch

        # https://www.geeksforgeeks.org/fnmatch-unix-filename-pattern-matching-python/
        matchers = [re.compile(fnmatch.translate(pattern.lower())).match
                    for pattern in self._ADVERT_PATTERNS]

        def is_advert(body):
            if not isinstance(body, str):
                return False
            lowered = body.lower()
            return any(match(lowered) for match in matchers)

        # One pass over the column instead of dropping and re-indexing per hit.
        advert_mask = df['body'].map(is_advert).astype(bool)
        return df[~advert_mask].reset_index(drop=True)

    # 120 Allows user to manually input value when stocktwits sentiment value is "None"
    def edit(self, df):
        """Interactively fill 'modified_rating' and return *df* (edited in place).

        Rows with sentiment 'None' and 'modified?' == 'No' are shown to the
        user, who enters 1, 0 or -1; the row is then marked 'modified?' = 'Yes'
        so that a later session skips it.  Rows with sentiment 'Bearish' or
        'Bullish' copy 'sentiment_number' into 'modified_rating'.  After every
        20 manual edits the user may stop early.
        """
        edits_since_prompt = 0

        for i in df.index:
            sentiment = df.loc[i, 'sentiment']

            if sentiment == 'None' and df.loc[i, 'modified?'] == 'No':
                print('\nindex number:', i, '\n', df.loc[i, 'body'])
                df.loc[i, 'modified_rating'] = self._prompt_rating()
                df.loc[i, 'modified?'] = 'Yes'
                edits_since_prompt += 1
            elif sentiment in ('Bearish', 'Bullish'):
                df.loc[i, 'modified_rating'] = df.loc[i, 'sentiment_number']

            if edits_since_prompt == 20:
                answer = input('Do you want to quit? (Enter either a "y" or "Y") ')
                if answer in ('y', 'Y'):
                    print('You are exiting.')
                    break
                edits_since_prompt = 0  # another 20 edits before asking again

        return df

    # 480 This removes words from the list of stopwords and writes list to csv file
    def remove_from_stopwords(self, sw, relevant_path):
        """Return *sw* without the word 'no' and save the result as stopwords.csv.

        The csv (single column 'stopwords') is written into *relevant_path*.
        """
        words_to_remove = {'no'}
        stopWords = [w for w in sw if w not in words_to_remove]

        csv_path = os.path.join(relevant_path, 'stopwords.csv')
        pd.DataFrame(stopWords, columns=['stopwords']).to_csv(csv_path, index=False)

        return stopWords

    # 490 Checks to see if the words were removed from the stopWords list.
    def check_stopwords(self, sw, WordToBeRem):
        """Print whether *WordToBeRem* still occurs in the stopword list *sw*."""
        occurrences = 0

        for w in sw:
            if w == WordToBeRem:
                print('The word ', w, ' is still in the stopWords list!')
                occurrences += 1

        if occurrences == 0:
            print('It did remove the words from the stopWords list!')

    # 510 Removes stopwords from all the "body" text (tweets)
    def rem_stopwords(self, df, stopWords):
        """Return a deep copy of *df* whose 'body' text has *stopWords* removed.

        Each body is tokenized with nltk's ``word_tokenize``, the stopwords
        are filtered out and the remaining tokens are joined with spaces.
        Rows with a missing body are dropped from the copy; *df* itself is
        left unchanged.
        """
        from nltk.tokenize import word_tokenize

        unwanted = set(stopWords)

        def scrub(text):
            return ' '.join([w for w in word_tokenize(text) if w not in unwanted])

        dfScrubbed = df.copy()  # deep copy: index and data are separate from df
        dfScrubbed['body'] = df['body'].map(scrub)

        dfScrubbed = self.remove_empty_body_rows(dfScrubbed)

        print('Are there any empty body records?')
        print(np.where(pd.isnull(dfScrubbed['body'])))

        return dfScrubbed

    # 640 compares the first record (index = 0) raw data with scrubbed data
    def compare_scrubbed(self, df, dfs):
        """Print the 'body' of record 0 from the original *df* and from the scrubbed *dfs*."""
        print(df.loc[0, 'body'])
        print(dfs.loc[0, 'body'])

    # 650 combines two different dataframes
    def combine_dfs(self, df1, df2):
        """Return the rows of *df1* followed by the rows of *df2*.

        The original index labels are kept, as ``DataFrame.append`` did;
        ``pd.concat`` is used because ``append`` was removed in pandas 2.0.
        """
        df = pd.concat([df1, df2])

        print('The length of file 1 is:', len(df1))
        print('The length of file 2 is:', len(df2))

        print('The length of the combined dataframe is:', len(df))

        return df

    # 660 Writes a csv file
    def write_csv(self, df, filename_output, relevant_path):
        """Write *df* as a utf-8 csv named *filename_output* inside *relevant_path*."""
        df.to_csv(os.path.join(relevant_path, filename_output), index=False, encoding='utf-8')
        print('The csv file was written. File name: ', filename_output)

    def list_dir_files(self, relevant_path):
        """Print *relevant_path* and the names of the .csv files it contains."""
        file_names = [fn for fn in os.listdir(relevant_path) if fn.endswith('.csv')]

        print('Path: ', relevant_path)

        for f in file_names:
            print(f)

    def remove_empty_body_rows(self, df):
        """Drop rows whose 'body' is missing (in place) and return *df* re-indexed."""
        df.dropna(subset=['body'], inplace=True)
        return df.reset_index(drop=True)

    def empty_records_check(self, df):
        """Print whether any 'body' value of *df* is missing, with the positions found."""
        print('Are there any empty body records?')
        empty = np.where(pd.isnull(df['body']))

        if empty[0].size == 0:
            print('There are no empty records! \n', empty)
        else:
            print('There are empty records ...\n', empty)

    def remove_emoji(self, string):
        """Return *string* with every character matched by the emoji pattern removed."""
        return self._EMOJI_RE.sub(r'', string)

    def combine_two_files(self):
        """Unfinished: reads one line from the user and does nothing with it yet."""
        first_filename = input()

    def rem_dup_adver_ever_oth_emoji(self, df):
        """Interactively remove duplicates, advertisements and emojis from *df*.

        Returns ``(df, r_emoji, rem_every_other, remove_advertisements,
        remove_dupl)``; each string is a file-name tag for a step that ran, or
        '' when it was skipped.  ``rem_every_other`` is always '' because no
        "every other" step is implemented in this method.
        """
        yes_resp = self._YES_RESPONSES

        remove_dupl = ''
        if input('Do you want to remove duplicates? [Press enter if no] ') in yes_resp:
            df = self.remove_duplicates(df)
            remove_dupl = 'r_d '

        remove_advertisements = ''
        if input('Do you want to remove advertisements? [Press enter if no] ') in yes_resp:
            df = self.filter_records(df)
            remove_advertisements = 'r_a '

        r_emoji = ''
        if input('Do you want to remove emojis from the body records: [Press enter if no] ') in yes_resp:
            df['body'] = df['body'].map(self.remove_emoji)
            r_emoji = 'r_emoj '

        rem_every_other = ''

        return df, r_emoji, rem_every_other, remove_advertisements, remove_dupl


In [None]:
##############################################################
#### MAIN                                                 ####
##############################################################

#Subreddit to query
# data will return the number of submissions or posts in the subreddit.
# query is a key word to be searched on in the subreddit
# after is the starting date for the search
# before is the ending date for the search
# sub is the subreddit

import datetime  # used for the progress timestamp printed inside the loop below

query = input('What stock symbol do you want to see? ')

sub = 'wallstreetbets'

unix_time = int(time.time()) # today's date in unix time
before = unix_time

# Keep asking until the answer is an integer inside the range the prompt promises;
# a non-integer used to raise ValueError and a value below 1 produced an empty window.
while True:
    try:
        days = int(input('How many days back do you want to go? [Enter an integer between 1 and 14.] \n'))
    except ValueError:
        days = 0
    if 1 <= days <= 14:
        break
    print('Please enter an integer between 1 and 14.')

after = int(before - (60 * 60 * 24 * days)) # start of the search window: `days` days before now

subCount = 0
subStats = {}

# modified by PH; added
commentStats = {} # dictionary to hold comments

# (The notebook's `%pwd` magic that stood here only reported the working
# directory; it did not set any path, so it was dropped.)

# getPushshiftData, collectSubData, updateSubs_file and updateComs_file are not
# defined in this part of the notebook.
# NOTE(review): presumably they come from another cell or module - confirm.
data = getPushshiftData(query, after, before, sub)

# Keep requesting until a request returns no posts; each new request starts at
# the creation time of the last post received.
while len(data) > 0:
    for submission in data:
        collectSubData(submission)

        # "subCount" is the number of posts in a subreddit
        subCount+=1

    print("created timestamp:"+str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))

    after = data[-1]['created_utc']

    data = getPushshiftData(query, after, before, sub)

updateSubs_file()
df = updateComs_file() #writes a csv file and also converts data to a df

# modified by PH    

#print("after while loop len data:"+str(len(data)))
#end


In [4]:
# Notebook shell escape: installs the finVizFetchPkg package used by the finviz cells below.
!pip install finVizFetchPkg

Collecting finVizFetchPkg
  Downloading https://files.pythonhosted.org/packages/25/ec/92b3ea22d11da1242672815c751b4b8f771d3a7524a4e539234cc1989f86/finVizFetchPkg-0.0.2-py3-none-any.whl
Installing collected packages: finVizFetchPkg
Successfully installed finVizFetchPkg-0.0.2


In [5]:
# Scrape finviz headlines into the dataframe `df`.
# Per the cell output below, scrape_finziz() asks for one stock symbol and the
# resulting frame has the columns ticker, date, time, title and link.
import finVizFetchPkg.finVizScaper
finvizdf = finVizFetchPkg.finVizScaper.finvizStreamer() #comment out to run in notebook
df = finvizdf.scrape_finziz()

What is the stock symbol? (Please enter only one.)v
ticker inside for loop:  v
request url:https://www.finviz.com/quote.ashx?t=v




  html = BeautifulSoup(response, 'html')


In [6]:
# Show the scraped finviz dataframe in the notebook.
display(df)

Unnamed: 0,ticker,date,time,title,link
0,v,Sep-08-21,12:06AM,PayPal to Buy Japanese Unicorn Paidy in $2.7 B...,https://finance.yahoo.com/news/paypal-acquire-...
1,v,Sep-06-21,06:06AM,3 No-Brainer Stocks to Invest $300 in Right Now,https://www.fool.com/investing/2021/09/06/3-no...
2,v,Sep-05-21,01:19PM,"While Visas (NYSE:V) Stock Price Appears Weak,...",https://finance.yahoo.com/news/while-visa-nyse...
3,v,Sep-04-21,09:26AM,10 Financial Services Dividend Champions to Bu...,https://finance.yahoo.com/news/10-financial-se...
4,v,Sep-03-21,06:20PM,NFT marketplace OpenSea records $3.4 billion t...,https://www.marketwatch.com/story/nft-marketpl...
...,...,...,...,...,...
95,v,Jul-21-21,10:03AM,Schwab (SCHW) Option Traders Betting on Bounce...,https://www.investopedia.com/schwab-schw-optio...
96,v,Jul-21-21,06:59AM,Meet Visa: Reintroducing the Iconic Visa Brand...,https://finance.yahoo.com/news/meet-visa-reint...
97,v,Jul-21-21,05:51AM,"Got $5,000? 5 Brand-Name Stocks That'll Make Y...",https://www.fool.com/investing/2021/07/21/got-...
98,v,Jul-20-21,05:45PM,Visa (V) Gains But Lags Market: What You Shoul...,https://finance.yahoo.com/news/visa-v-gains-la...


MAIN

In [None]:
#######################################################
####                     MAIN                      ####
#######################################################

yes_resp = ['yes', 'YES', 'y', 'Y', 'Yes']
no_resp = ['no', 'NO', 'n', 'N', 'No']

#need to accommodate for three different types of inputs. 
#         - finviz
#         - stocktwits
#         - reddit

# step one is to either scrape or parse the data into a dataframe, df.
#############
# Finviz scraper
#############
#finvizdf = finvizStreamer()
import finVizFetchPkg()
finvizdf = finVizFetchPkg.finVizScaper.finvizStreamer() #comment out to run in notebook
df = finvizdf.scrape_finziz()

#############
# stocktwits API
#############


#############
# reddit API
#############

# Step two is to convert each of the dataframes into a common format with the same column names (data)

# NOTE(review): `finviz_resp` is not assigned anywhere in this cell; it has to exist
# in the session already (e.g. from an earlier prompt) or this line raises NameError.
if finviz_resp in yes_resp:
    df.rename(columns = {'ticker':'symbol', 'title':'body', 'Sentiment':'sentiment'}, inplace = True) #renames the columns to match the stocktwits names

####################################
'''finviz output columns:
ticker,date,time,title,Sentiment

stocktwits parser output columsn:
symbol,messageID ,created_at,body,followers,sentiment,date,time

ticker = symbol
title = body
Sentiment = sentiment
date = date
time = time

"messageID", "created_at", "followers" do not exist in scraped finviz csv '''

######################################

# Step 3 is to scrub the df to optimize the natural language sentiment classification model development and accuracy.
# Each helper takes the dataframe and returns the dataframe to carry forward.

df = remove_duplicate_headers(df)

df = remove_duplicates(df)

df = remove_http_tags(df)

df = remove_punctuation(df)

df = remove_unicode(df)

df = lower_case(df)

#df = convert_sentiment_to_numerical(df) #line 240

#df = compound_binning(df) #line 210

from nltk.corpus import stopwords
# Imported here so that WordNetLemmatizer() below does not depend on another
# cell having been run first.
from nltk.stem import WordNetLemmatizer
stop_words = stopwords.words('english')
df = remove_stopwords(df, stop_words)

lemmer = WordNetLemmatizer()
df = lemmatize(df, lemmer)


#### checks to see if there are any empty records
print('Test empty records before writing the csv file')
empty_records_check(df)

df = remove_empty_body_rows(df)

# Writes a csv file; input is the df that is to be saved as a csv; the output file name is the
# input file's name with '_preprocessed_lemmatized.csv' appended.
# NOTE(review): `name` and `relevant_path` are not assigned in this cell; they must already exist.
w_csv = input('Do you want to write a csv file? [Press enter if no] ')
if w_csv in yes_resp:
    new_name = name.replace('.csv', '') #removes the .csv from the input file's name
    print(new_name)
    processed = 'preprocessed'
    # creates a file name that is a combination of all the different scrubbing types
    #filename_output = processed + remove_dupl + remove_advertisements + rem_every_other + swords + ed + r_emoji + vader_run + name
    filename_output = new_name + '_' + processed + '_lemmatized.csv'
    final_name =  relevant_path + '/' + filename_output

    # Checks whether the output file already exists. The previous test compared `name`
    # with `filename_output`, which can never be equal because a suffix was just
    # appended, and it removed the file relative to the working directory rather
    # than from `relevant_path`, where the csv is written and read back below.
    if os.path.exists(final_name):
        os.remove(final_name) #If the file already exists it deletes the original file
        print('The old file was deleted.\n')

    write_csv(df, filename_output, relevant_path) #Writes the df to a new file
    print('The file was written with the filename of: ', filename_output, '\n')

    # NOTE TO SELF - When there is a record that has spaces only, it is encoded as a 'NaN' or empty record
    #when encoded as a utf-8 csv file. It will cause the postprocessing Vader app to crash. Importing the csv file
    #and then removing the 'NaN' and then rewriting the csv file should take care of the problem.

    print('The filename is: \n', final_name)
    dftest = getData(final_name)
    print('csv file read into df to see if all of the empty records are removed.')
    empty_records_check(dftest)
    df_final = remove_empty_body_rows(dftest)
    empty_records_check(df_final)

    os.remove(final_name) #Deletes the first-pass file before the cleaned version is written
    write_csv(df_final, filename_output, relevant_path) #Writes the df to a new file

# Combines two dfs: optionally loads two csv files chosen by the user, merges them with
# combine_dfs, and optionally writes the result to a new csv file.
c_t_dfs = input('Do you want to combine two files? [Press enter if no] ')
if c_t_dfs in yes_resp:

    print('Here is a list of the csv files to choose from: \n')
    list_dir_files(relevant_path)
    first_name = input('\nWhat is the first file you want to combine? ')
    df = getData(relevant_path + '/' + first_name) #returns df; reads csv file into df
    print('Imported the csv file.')

    second_name = input('What is the second file you want to add? ')
    df2 = getData(relevant_path + '/' + second_name)

    # 650 Loads and combines two different dataframes in dfAPI; this is to combine two input datasets where the 'none'
    #values have been modified; this is to see if increased records will increase the accuracy of the model.
    # The first file was loaded into `df` (there is no `df1` in this cell), so that is what gets combined.
    df = combine_dfs(df, df2)

    w_csv = input('Do you want to write a csv file? [Press enter if no] ')
    if w_csv in yes_resp:
        first_name_no_csv = first_name.replace('.csv', ' + ')
        # Uses the second file name entered above; the previous `second` was never assigned in this cell.
        duo_name = first_name_no_csv + second_name
        # NOTE(review): the other write_csv calls pass no encoding, and 'UNICODE' is not a
        # Python codec name - confirm what write_csv does with this argument.
        write_csv(df, duo_name, relevant_path, encoding = 'UNICODE') #Writes the df to a new file
        print('The file was written with the filename of: ', duo_name, '\n')


print('\nAll done ....')

In [None]:
# Scratch check: show the column labels of whatever `df` currently holds.
print(df.columns)

In [None]:
# Scratch check: render the leading rows of `df`.
# NOTE(review): `display` is not imported in this file; presumably the IPython/notebook built-in.
display(df.head())

In [None]:
# Scratch check of the output-naming rule: strip the '.csv' extension from the
# input file's name, then append a new suffix.
testname = 'test_name.csv'
stem = testname.replace('.csv', '')
new_name = f'{stem}_test.csv'
print(new_name)

In [None]:
import nltk
import re
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
def test(df):
    """Lemmatize ``df['body_processed']`` in place, dropping English stopwords.

    Each entry is split on whitespace; tokens found in NLTK's English stopword
    list are discarded and the remaining tokens are passed through
    ``WordNetLemmatizer.lemmatize`` before being re-joined with single spaces.

    Args:
        df: DataFrame with a string column ``body_processed``.

    Returns:
        The same DataFrame object, with ``body_processed`` overwritten.
    """
    print('\nLemmatizing ...')

    import time

    import nltk
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import stopwords

    # A set gives constant-time membership tests; the list returned by NLTK
    # would be scanned once per token.
    stop_words = set(stopwords.words('english'))
    nltk.download('wordnet') #not in original code

    # Lemmatize the text
    lemmer = WordNetLemmatizer()
    # Timed with perf_counter instead of the IPython-only `%time` magic so the
    # function is also valid outside a notebook.
    started = time.perf_counter()
    df['body_processed'] = df['body_processed'].map(
        lambda x: ' '.join(lemmer.lemmatize(w) for w in x.split() if w not in stop_words))
    print(f'Wall time: {time.perf_counter() - started:.3f} s')

    return df


In [None]:

print('\nLemmatizing ...')
    
import nltk
import re
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
nltk.download('wordnet') #not in original code
    
# Lemmatize the text
lemmer = WordNetLemmatizer()
%time df['body_processed'] = df['body_processed'].map(lambda x : ' '.join([lemmer.lemmatize(w) for w in x.split() if w not in stop_words]))



In [None]:
# Runs whichever `test` is currently bound in the session: this file defines a
# function named `test` in more than one cell and also assigns an input() string
# to the same name, so the result depends on which cell was executed last.
df = test(df)

In [None]:
def lemmatize(df, lemmer=None):
    """Lemmatize ``df['body_processed']`` in place, dropping English stopwords.

    Each entry is split on whitespace; tokens found in NLTK's English stopword
    list are discarded and the remaining tokens are lemmatized and re-joined
    with single spaces.

    Args:
        df: DataFrame with a string column ``body_processed``.
        lemmer: Optional object exposing ``lemmatize(word)``. When omitted, a
            new ``WordNetLemmatizer`` is created. Accepting it here keeps the
            one-argument call working and also supports the two-argument call
            ``lemmatize(df, lemmer)`` made from the MAIN cell.

    Returns:
        The same DataFrame object, with ``body_processed`` overwritten.
    """
    print('\nLemmatizing ...')

    import time

    import nltk
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import stopwords

    # A set gives constant-time membership tests for every token.
    stop_words = set(stopwords.words('english'))
    nltk.download('wordnet') #not in original code

    # Lemmatize the text
    if lemmer is None:
        lemmer = WordNetLemmatizer()
    # Timed with perf_counter instead of the IPython-only `%time` magic so the
    # function is also valid outside a notebook.
    started = time.perf_counter()
    df['body_processed'] = df['body_processed'].map(
        lambda x: ' '.join(lemmer.lemmatize(w) for w in x.split() if w not in stop_words))
    print(f'Wall time: {time.perf_counter() - started:.3f} s')

    return df

In [None]:
# Applies the lemmatizing step to `df` and keeps the returned frame.
# NOTE(review): called with one argument here, while the MAIN cell calls
# lemmatize(df, lemmer) - which form works depends on the definition last run.
df = lemmatize(df)

In [None]:
def test(df):
    """Remove stopwords from ``df['body_processed']`` in place.

    The stopword list is NLTK's English list extended with project-specific
    tokens (ticker symbols and stray numbers). The full list is printed before
    filtering.

    Args:
        df: DataFrame with a string column ``body_processed``.

    Returns:
        The same DataFrame object, with ``body_processed`` overwritten.
    """
    import time

    from nltk.corpus import stopwords

    stop_words = stopwords.words('english')

    #adds new stopwords to list
    # NOTE: ' 270' (with a leading space) can never equal a token produced by
    # str.split(); the plain '270' entry is the one that takes effect.
    new_stop_words = ['intc', 'nvda', 'tsla', 'mu', 'msft', 'tsm', 'adbe', 'unh', '39', ' 270',
                      '270000', '4033477', '244', '16', '399', '800', '270', '000', '60', '74',
                      '1600', '993', '392', '98', '00', '1601', 'amd', 'aapl']
    stop_words.extend(new_stop_words)

    print('stop_words: ', stop_words)

    # Set copy for constant-time membership tests; the list above is kept for
    # the printout only.
    stop_word_set = set(stop_words)

    #removes the stopwords from the column body_processed
    # Timed with perf_counter instead of the IPython-only `%time` magic so the
    # function is also valid outside a notebook.
    started = time.perf_counter()
    df['body_processed'] = df['body_processed'].map(
        lambda x: ' '.join(w for w in x.split() if w not in stop_word_set))
    print(f'Wall time: {time.perf_counter() - started:.3f} s')

    return df

In [None]:
# Runs the currently bound `test` on `df`, keeps the returned frame and renders it.
# Which `test` runs depends on the cell executed last (the name is defined more than once).
df = test(df)
display(df)

In [None]:
import nltk
import re
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

#adds new stopwords to list
new_stop_words = ['intc', 'nvda', 'tsla', 'mu', 'msft', 'tsm', 'adbe', 'unh', '39', ' 270',
                      '270000', '4033477', '244', '16', '399', '800', '270', '000', '60', '74',
                      '1600', '993', '392', '98', '00', '1601', 'amd', 'aapl']
for w in new_stop_words:
    stop_words.append(w)

print('stop_words: ', stop_words)

#removes the stopwords from the column body_processed
%time df['body_processed'] = df['body_processed'].map(lambda x : ' '.join([w for w in x.split() if w not in stop_words]))


In [None]:
# Scratch check: render the current `df`.
display(df)

In [None]:
# Scratch check: load one specific csv file through getData and print its leading rows.
# The path is relative, so it is resolved against the current working directory.
dftest = getData('preprocessed tech stockTwit 03112021.csv')
print(dftest.head())

In [None]:
# Scratch check: print `df` as it is; the index reset below is left disabled.
#df = df.reset_index(drop = True)
print(df)

In [None]:
# 100 Counts how many "None" sentiment values there are for the stocktwits sentiment value.
# none_count_raw is defined elsewhere in the file; its return value (if any) is not kept.
none_count_raw(df) 


In [None]:
# Scratch check of the yes/no prompt handling.
yes_resp = ['yes', 'YES', 'y', 'Y', 'Yes']
no_resp = ['no', 'NO', 'n', 'N', 'No']

# Stored as `test_resp` rather than `test`: other cells define and call a
# function named `test`, and binding the answer string to that name made a later
# `df = test(df)` fail with "'str' object is not callable".
test_resp = input('do you want to test? ')
if test_resp in yes_resp:
    print('yes I do')

In [None]:
# Scratch check: build an output file name by prefixing the input name with one
# tag per scrubbing step (an empty tag means that step was skipped).
name1 = 'output.csv'
remove_dupl = 'a '
remove_advertisements = 'b '
remove_every_other = ''
ed = 'd '

filename_output = ''.join(
    [remove_dupl, remove_advertisements, remove_every_other, ed, name1]
)

print(filename_output)

In [None]:
#how to determine if column exists
import pandas as pd
 
df = pd.DataFrame([[10, 20, 30, 40], [7, 14, 21, 28], [55, 15, 8, 12]],
                  columns=['Apple', 'Orange', 'Banana', 'Pear'],
                  index=['Basket1', 'Basket2', 'Basket3'])
 
if 'apple' not in df.columns:
    print("in - no")
else:
    print("notin - yes")
 
 
if set(['Apple','Orange']).issubset(df.columns):
    print("Yes")
else:
    print("No")

In [None]:

def remove_duplicate_headers(df):
    """Drop, in place, every row whose 'symbol' value is the literal text 'symbol'.

    Such rows are header lines repeated inside the data. The index is not
    renumbered. Returns the same DataFrame object that was passed in.
    """
    header_rows = df.loc[df['symbol'] == 'symbol'].index
    df.drop(header_rows, inplace=True)
    return df

# Scratch check of remove_duplicate_headers: load a csv, report where repeated
# header rows sit, remove them, then confirm none are left.
print('Here is a list of the csv files to choose from: \n')
# NOTE(review): other cells call list_dir_files(relevant_path); confirm the no-argument form is supported.
list_dir_files()
name = input('\nWhat file do you want to use? ')
df = getData(name) #returns df; reads csv file into df
print('Imported the csv file.')

print(df.head())

# Positions (not labels) of rows whose first column holds the text "symbol".
header_positions = [pos for pos, value in enumerate(df.iloc[:, 0]) if value == "symbol"]
for pos in header_positions:
    print('The index is: ', pos)

print('starting to remove headers')
df = remove_duplicate_headers(df)
print('done removing headers')

remaining_positions = [pos for pos, value in enumerate(df.iloc[:, 0]) if value == "symbol"]
for pos in remaining_positions:
    print('The index is: ', pos)
# Only claim success when nothing was found. The previous `while ... else` ran
# its else-branch after every loop that finished without `break`, so the
# message was printed even when header rows were still present.
if not remaining_positions:
    print('They are all gone!')

print(df.head())


In [None]:
import os

def remove_duplicate_headers(df):
    """Remove repeated header lines from ``df`` in place.

    A row counts as a repeated header when its 'symbol' cell contains the
    column name 'symbol' itself. The index is left as it is, and the same
    DataFrame object is returned.
    """
    is_header_row = (df['symbol'] == 'symbol').to_numpy()
    df.drop(df.index[is_header_row], inplace=True)
    return df

# Scratch check: list the csv files in the post-processing folder, load one, and
# compare the empty 'body' records before and after the clean-up.
relevant_path = 'C:/Users/pstri/OneDrive/Documents/Personal/Kokoro/NLTK/Code Project/Post Processing'
included_extensions = ['csv']
# str.endswith accepts a tuple, so one call covers every listed extension.
file_names = [fn for fn in os.listdir(relevant_path)
              if fn.endswith(tuple(included_extensions))]

for f in file_names:
    print(f)

name = input('What file do you want: ')
df = getData(relevant_path + '/' + name)

print(df.head(120))

print('before:')
empty = np.where(pd.isnull(df['body']))
# Print the positions found; the previous print('empty') printed the literal word.
print(empty)

df = remove_duplicate_headers(df)

df = df.fillna(value ={'body':' '}) #replaces any empty 'body' records with a space

print('after:')
# Printed explicitly: a bare expression in the middle of a cell shows nothing.
print(np.where(pd.isnull(df['body'])))

print(df.head(120))



In [None]:
# Finding and removing empty records in a df: re-read the written csv, report the
# rows whose 'body' is null, drop them, and report again.
csv_path = relevant_path + '/' + filename_output
dftest = getData(csv_path)

print(csv_path)
print('csv file read into df to see if all of the empty records are removed.')

# np.where on a boolean Series returns a one-element tuple holding the array of
# positions where 'body' is null.
empty = np.where(dftest['body'].isnull())
if empty[0].size:
    print('There are empty records: \n', empty, '\n')
else:
    print('There are no empty records: \n', empty)

print(dftest.iloc[110:125])

# Drop the empty 'body' records and renumber the index in one chain.
dftest = dftest.dropna(subset=['body']).reset_index(drop=True)

empty = np.where(dftest['body'].isnull())
print('\nAFTER DROP: \n', empty, '\n')

print(dftest.iloc[110:125])

# Drops the rows that have no 'body' value and renumbers the index.
def remove_empty_body_rows(df):
    """Return ``df`` without null-'body' rows, indexed 0..n-1.

    The rows are dropped from the passed-in frame in place; the returned frame
    is the renumbered copy produced by ``reset_index``.
    """
    df.dropna(subset=['body'], inplace=True)
    return df.reset_index(drop=True)




In [None]:
# Scratch check: print a ten-row slice of the 'body' column.
print(df['body'][10:20])

In [None]:
#removes emojis 

# Imports the csv file of choice from the scraped-files folder
relevant_path = 'C:/Users/pstri/OneDrive/Documents/Personal/Kokoro/NLTK/Code Project/Scraped Files'

print('Here is a list of the csv files to choose from: \n')
list_dir_files(relevant_path)
name = input('\nWhat file do you want to use? ')
df = getData(relevant_path + '/' + name) #returns df; reads csv file into df
print('Imported the csv file.')



import re

# Compiled once instead of on every call. The character class is the same set
# as before; only a range that was listed twice has been removed.
# NOTE: the ranges overlap heavily, and two of them are very wide -
# U+24C2..U+1F251 also contains the CJK ideographs and U+10000..U+10FFFF is
# every character outside the Basic Multilingual Plane - so far more than
# emoji is stripped.
_EMOJI_PATTERN = re.compile("["
                            "\U0001F600-\U0001F64F"  # emoticons
                            "\U0001F300-\U0001F5FF"  # symbols & pictographs
                            "\U0001F680-\U0001F6FF"  # transport & map symbols
                            "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            "\U00002500-\U00002BEF"  # box drawing through misc symbols and arrows
                            "\U00002702-\U000027B0"  # dingbats
                            "\U000024C2-\U0001F251"
                            "\U0001f926-\U0001f937"
                            "\U00010000-\U0010ffff"
                            "\u2640-\u2642"
                            "\u2600-\u2B55"
                            "\u200d"  # zero width joiner
                            "\u23cf"
                            "\u23e9"
                            "\u231a"
                            "\ufe0f"  # variation selector-16
                            "\u3030"
                            "]+", flags=re.UNICODE)


def remove_emoji(string):
    """Return ``string`` with every character matched by the emoji pattern removed.

    Matched runs are deleted outright (nothing is inserted in their place), so
    whitespace on either side of a removed run is left as it was.
    """
    return _EMOJI_PATTERN.sub(r'', string)

i = 0

yes_dec = ['yes', 'y']

decision = input('decide: ')

if decision in yes_dec:
    # Apply remove_emoji to the whole 'body' column in one pass. The previous
    # row-by-row `df.loc[i, 'body']` loop gave the same result for the freshly
    # loaded frame, but it relied on the index being exactly 0..len(df)-1 and
    # did one label lookup plus one assignment per row.
    df['body'] = df['body'].map(remove_emoji)

print('all done')

print(df.loc[13,'body'])


In [None]:
# Scratch check: print the 'body' value of the row labelled 13.
print(df.loc[13,'body'])

In [None]:
# Scratch check: for the rows labelled 0 and 1, print the 'body' value on its own
# and wrapped in a one-element list.
i = 0

while i < 2:
    #string = df.iloc[i,2]
    string = df.loc[i, ('body')]  # not used again within this cell
    
    dat = df.loc[i, ('body')] 
    data = [dat] # one-element list holding the same value
    
    print('original string: ', dat)
    print(data)
    i += 1
    
    

In [None]:
#manipulating two names and then adding them together
# The first name's '.csv' extension is swapped for ' + ' and the second name is
# appended unchanged.

first, second = 'first.csv', 'second.csv'
first_no_csv = first.replace('.csv', ' + ')

first_second = f'{first_no_csv}{second}'
print(first_second)

In [None]:
# Scratch check of int() on a negative float: int() discards the fractional part
# (truncates toward zero), so despite the variable name this is not rounding and
# the printed value is -1.
negative = -1.9
rounding = int(negative)
print(rounding)

In [None]:
# 40 Finds certain words/phrases in the 'body' strings and deletes the entire record.
# The filtering terms are shell-style wildcard patterns (fnmatch syntax) and are
# matched without regard to case.
def filter_records(df):
    """Return ``df`` without the rows whose 'body' matches an advert pattern.

    A pattern must match the whole 'body' text, so a phrase that may appear
    anywhere needs ``*`` on both sides; entries without wildcards only remove
    records whose body is exactly that text. Matching ignores case on every
    platform (``fnmatch.filter`` alone is case-sensitive on POSIX systems).
    Non-string bodies (for example NaN) never match and are kept.

    Args:
        df: DataFrame with a 'body' column.

    Returns:
        A new DataFrame with the matching rows removed and the index
        renumbered 0..n-1. The frame passed in is not modified.
    """
    import fnmatch
    import re

    advert = ['* sec *', '* daily News *', '*Huge Print*', '* Form *', '*SweepCast*', '*Large Print*',
          '*Huge Print*', '*8-K*', '*SmartOptions*', '*Big Trade*', '*SEC Form*', '*Notice of Exempt*',
          '*created_at*', '*stock news*', '*Trading Zones*', '*Entry:*', '*New Article*', '*ooc.bz*',
          '*http*', 'Huge Trade', 'Trading is easy', 'www.', '#wallstreetbets', 'wallstreetbets',
          'Huge Trade', '#unitedtraders', 'stockbeep.com', 'Big Trade'] # words or phrases whose records are to be removed; It is not case sensitive.

    # Translate every distinct pattern to a compiled regex once, up front.
    matchers = [re.compile(fnmatch.translate(pattern), re.IGNORECASE)
                for pattern in dict.fromkeys(advert)]

    def is_advert(body):
        return isinstance(body, str) and any(m.match(body) for m in matchers)

    # One boolean per row, then a single filter. The earlier loop dropped rows
    # one at a time without renumbering and then looked the dropped label up
    # again with df.loc, which raised KeyError on the first match.
    advert_rows = df['body'].map(is_advert).to_numpy(dtype=bool)
    return df[~advert_rows].reset_index(drop = True)

# Apply the advert filter to `df` and keep the returned frame.
df = filter_records(df)


In [None]:
# 40 Finds certain words/phrases in the 'body' strings and deletes the entire record.
# Flat (cell-level) version of the advert filter. The filtering terms are
# shell-style wildcard patterns (fnmatch syntax), must match the whole 'body'
# text, and are matched without regard to case on every platform
# (fnmatch.filter alone is case-sensitive on POSIX systems).

import fnmatch
import re
df = df.reset_index(drop = True) # resets the index; removes the gaps
advert = ['* sec *', '* daily News *', '*Huge Print*', '* Form *', '*SweepCast*', '*Large Print*',
          '*Huge Print*', '*8-K*', '*SmartOptions*', '*Big Trade*', '*SEC Form*', '*Notice of Exempt*',
          '*created_at*', '*stock news*', '*Trading Zones*', '*Entry:*', '*New Article*', '*ooc.bz*',
          '*http*', 'Huge Trade', 'Trading is easy', 'www.', '#wallstreetbets', 'wallstreetbets',
          'Huge Trade', '#unitedtraders', 'stockbeep.com', 'Big Trade'] # words or phrases whose records are to be removed; It is not case sensitive.

# Translate every distinct pattern to a compiled regex once, up front.
_advert_matchers = [re.compile(fnmatch.translate(pattern), re.IGNORECASE)
                    for pattern in dict.fromkeys(advert)]


def _is_advert(body):
    # Non-string bodies (for example NaN) never match and are kept.
    return isinstance(body, str) and any(m.match(body) for m in _advert_matchers)


# One boolean per row and a single filter, instead of dropping and renumbering
# the frame once for every matching record.
_advert_rows = df['body'].map(_is_advert).to_numpy(dtype=bool)
counter = int(_advert_rows.sum()) # number of records removed

df = df[~_advert_rows].reset_index(drop = True) # resets the index; removes the gaps
len(df)



In [None]:
# Scratch check: print the rows labelled 340 through 350 (label slices include both ends).
print(df.loc[340:350,:])

In [None]:
# 110 This removes "None" records to reduce the total number of "None" ratings. This is to make
#the 'None' proportions more equal. It also prints the ratios of each sentiment response to the total number
#of responses.
def remove_every_other(df):
    """Drop the "None"-sentiment rows that sit at even positions, then print statistics.

    The frame is first renumbered 0..n-1; a row is removed when its 'sentiment'
    is the string 'None' and its position is even. Afterwards the count and
    percentage of 'None', 'Bullish' and 'Bearish' rows are printed.

    Args:
        df: DataFrame with a 'sentiment' column.

    Returns:
        A new, renumbered DataFrame. The frame passed in is not modified.
    """
    df = df.reset_index(drop = True) #resets the index to be continuous

    # Build the row selection in one step. The earlier loop called
    # df.drop(...) without keeping the result, so no record was ever removed.
    is_none = (df['sentiment'] == 'None').to_numpy(dtype=bool)
    at_even_position = (df.index % 2 == 0)
    df = df[~(is_none & at_even_position)].reset_index(drop = True)

    total = len(df)
    print('\nThe total number of records is: ', total)

    # The last percentage line ends with a newline, as it always has.
    for label, suffix in (('None', '%'), ('Bullish', '%'), ('Bearish', '% \n')):
        sentiment_number = int((df['sentiment'] == label).sum())
        # Percentage truncated to one decimal place; 0.0 for an empty frame
        # rather than a division by zero.
        percentage = int(sentiment_number/total * 1000)/10 if total else 0.0
        print(f'The number of "{label}" stocktwits sentiment values is:', sentiment_number)
        print(f'The percentage of "{label}" values is:', percentage, suffix)

    return df

# NOTE(review): the returned frame is not assigned, so `df` itself is left as it
# was; use `df = remove_every_other(df)` if the reduced frame should be kept.
remove_every_other(df)

In [None]:
# Scratch check: print the rows labelled 0 through 10 (label slices include both ends).
print(df.loc[0:10, :])

In [None]:
# 110 This removes "None" records to reduce the total number of "None" ratings. This is to make
#the 'None' proportions more equal.
def remove_every_other(df):
    """Thin out "None"-sentiment rows and return the renumbered frame.

    Rows are examined from position 1 onwards (row 0 is never examined). A row
    is removed when its 'sentiment' is the string 'None' and it currently sits
    at an even position. Positions are counted in the frame as it shrinks, so
    after a removal the row that slides into the freed position is skipped and
    the parity of every later row flips. The row count is printed before and
    after.

    Args:
        df: DataFrame with a 'sentiment' column.

    Returns:
        A new, renumbered DataFrame. The frame passed in is not modified.
    """
    df = df.reset_index(drop = True) #resets the index to be continuous

    print(len(df))

    sentiments = df['sentiment'].tolist()
    doomed = []   # original positions of the rows to remove
    position = 1  # position the row would have in the shrinking frame
    source = 1    # position of the same row in the untouched frame

    # Same selection as dropping and renumbering inside the loop, but worked
    # out on a plain list: no per-removal reindexing, and no lookup of a
    # position that no longer exists when the removed row was the last one
    # (which used to raise KeyError).
    while source < len(sentiments):
        if sentiments[source] == 'None' and position % 2 == 0:
            doomed.append(source)
            source += 2  # the next row slides into this position and is skipped
        else:
            source += 1
        position += 1

    df = df.drop(index=doomed).reset_index(drop = True) #resets the index to be continuous

    print(len(df))

    return df

# NOTE(review): the returned frame is not assigned, so `df` itself is left as it
# was; use `df = remove_every_other(df)` if the reduced frame should be kept.
remove_every_other(df)

In [None]:
# Scratch check: number of rows currently in `df`.
len(df)

In [None]:
# Scratch check: drop the row at position 0 and look at the start of the frame.
# The index is not renumbered after the drop.
i = 0
df = df.drop(df.index[i]) #drops (deletes) the record
#print('index =', i, df.loc[i,('sentiment')])

print(df.loc[0:15,:])


In [None]:
#Create a DataFrame
# Note: this cell rebinds `df` to a small demo frame.

import pandas as pd
import numpy as np

d = {
    'Name': ['Alisa', 'raghu', 'jodha', 'jodha', 'raghu', 'Cathrine',
             'Alisa', 'Bobby', 'Bobby', 'Alisa', 'raghu', 'Cathrine'],
    'Age': [26, 23, 23, 23, 23, 24, 26, 24, 22, 26, 23, 24],
    'Score': [85, 31, 55, 55, 31, 77, 85, 63, 42, 85, 31, np.nan],
}

# Column order follows the dictionary's key order: Name, Age, Score.
df = pd.DataFrame(d, columns=list(d))

df

# drop() returns a new frame; the result is not kept, so `df` still has every row.
df.drop([1, 2])

df

In [None]:
# Scratch check: ask whether the file came from Finviz and report whether the
# answer is one of the accepted "yes" spellings.
yes_resp = ['yes', 'YES', 'y', 'Y', 'Yes']
no_resp = ['no', 'NO', 'n', 'N', 'No']

finviz_resp = input('Is this a file from scraping Finviz? ')

print('It is in there' if finviz_resp in yes_resp else 'It is not in there')
    
    

In [None]:
'''if 'raw_compound' not in df.columns:
    before_scrubbing = input('Do you want to run the Vader analysis before scrubbing? \n')
    if before_scrubbing in yes_resp:
        vader_run = 'v_b '
        print('\nThis is the first time this file has been preprocessed.\n')
        print('Performing Vader sentiment analysis before scrubbing... \n')
    
        df = vader_analysis(df)
        
        df, r_emoji, rem_every_other, remove_advertisements, remove_dupl = rem_dup_adver_ever_oth_emoji(df)
        df, swords = remove_stopwords(df)
        
    else:
        vader_run = 'v_a '
        print('\n Performing Vader sentiment analysis after scrubbing... \n')
        
        df, r_emoji, rem_every_other, remove_advertisements, remove_dupl = rem_dup_adver_ever_oth_emoji(df)
        df, swords = remove_stopwords(df)
    
        df = vader_analysis(df)   
else:
    print('\nThis file has been preprocessed before. There is no need to run the VADER analysis.\n')
'''

'''# 90 OPTIONAL Compares the Vader sentiment numbers with the Stocktwits sentiment ratings.
v_c = input('Do you want to compare the Vader sentiment numbers with the Stocktwits sentiment ratings? [Press enter if no] ')
if v_c in yes_resp:
        if 'raw_compound' in df.columns: #checks to see if this file have been prepocessed before by seeing if the column 'raw_compond' exists
            vader_correct(df) 

# 100 OPTIONAL: Counts how many "None" sentiment values are there for the stocktwits sentiment value
c_n_s = input('Do you want to count the "None" sentiment values for the Stocktwits sentiments before any edits? [Press enter if no] ')
if c_n_s in yes_resp:
    none_count_raw(df) 

# 115 OPTIONAL: Provides statistics on Stocktwits sentiments; bullish, none or bearish.
s_o_s = input('Do you want to see the statistics on the Stocktwits sentiments? [Press enter if no] ')
if s_o_s in yes_resp:
    stats(df) 

# 120 OPTIONAL: Allows the user to manually input a value when the stocktwits sentiment value is "None".
# It counts every 20 edits and gives the user the option to quit. If the user chooses to quit,
# it breaks from the while loop and writes the df to a csv file so all work is saved up to that point.
# Upon start up it asks if this is the first time processing the raw data. If no, it loads the csv file into
# the dataframe and starts where the previous session left off. If "modified?" is "Yes" and "sentiment" is "None"
# it skips the record. Therefore it will re-start at the first record where "modified?" is "No" and "sentiment" is "None".

e_n = input('Do you want to edit the "None" records? [Press enter if no] ')
if e_n in yes_resp:
    df = edit(df) #returns df
    ed = 'edited '
else:
    ed = ''

# 180 OPTIONAL: counts how many "None" sentiment values are there for the stocktwits sentiment values after the edit
n_r_a_e = input('Do you want to see how many "None" records there are after the edits? [Press enter if no] ')
if n_r_a_e in yes_resp:
    none_count(df) 

# 140 OPTIONAL: This will change the modified rating to the nltk rating only when they are opposite to see if it improves 
#the accuracy number 
# flip vader rating if opposite to stocktwits sentiment
f_v_r = input('Do you want to flip the Vader sentiment rating when it is the opposite of the Stocktwits sentiment rating? [Press enter if no] ')
if f_v_r in yes_resp:
    df = change_opp_nltk(df) #returns df

# 180 OPTIONAL: counts how many "None" sentiment values are there for the stocktwits sentiment value
n_c_a_e = input('Do you want to see the number of "None" sentiments after the edit? [Press enter if no] ')
if n_c_a_e in yes_resp:
    none_count(df) '''

# methods

class SentimentAnalysisPreprocessing:
    """Namespace for the preprocessing helpers used on the scraped sentiment data.

    The helper functions defined in the class body take the dataframe they
    work on as an explicit argument; the constructor only stores a dataframe
    on the instance.
    """

    def __init__(self, df):
        """Keep a reference to *df* on the instance."""
        self.df = df
        
    # 10 initializes the dataframe "df" and imports the csv into df; 
    # the argument is the name/address of the file.
    # https://stackoverflow.com/questions/33440805/pandas-dataframe-read-csv-on-bad-data
    def getData(name):
        df1 = pd.DataFrame() # defines df1 as a dataframe
        df1 = pd.read_csv(name, header = 0)
        return df1

    # removes duplicate headers
    def remove_duplicate_headers(df):
        print('\nDropping duplicate headers ...')
        column = 'symbol'
        %time df.drop(df[df['symbol'] == column].index, inplace=True)
        df = df.reset_index(drop = True) # resets the index
        return df

    # 30 removes any duplicate records; duplicate records imply bot records
    def remove_duplicates(df):
        print('\nDropping duplicates ...')
        %time df = df.drop_duplicates()
        df = df.reset_index(drop = True) # resets the index
        len(df)
        return df

    # removes @mentions, URLs and every character that is not a letter, digit, space or tab
    def remove_http_tags(df):
        print('\nRemoving http tags ...')
        %time df['body_processed'] = df['body'].map(lambda x : ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x).split()))
        return df

    # converts to all lower case
    def lower_case(df):
        print('\nConverting to lower case ...')
        %time df['body_processed'] = df['body_processed'].map(lambda x: x.lower())
        return df

    # removes all punctuation
    def remove_punctuation(df):
        print('\nRemoving punctuation ...')
        %time df['body_processed'] = df['body_processed'].map(lambda x: re.sub(r'[^\w\s]', '', x))
        return df

    # removes unicodes (emojis)
    def remove_unicode(df):
        print('\nRemoving unicode ...')
        %time df['body_processed'] = df['body_processed'].map(lambda x : re.sub(r'[^\x00-\x7F]+',' ', x))
        return df

    def lemmatize(df, lemmer): #lemmer must be defined outside of the function and passed in
        print('\nLemmatizing ...')
        %time df['body_processed'] = df['body_processed'].map(lambda x : ' '.join([lemmer.lemmatize(w) for w in x.split() if w not in stop_words]))
        return df

    # Remove stopwords
    def remove_stopwords(df, stop_words): #stop_words must be defined outside of the function and passed in
        print('\nRemoving stopwords ...')

        #adds new stopwords to list

        newStopWords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
        newStopWords += ['again', 'against', 'all', 'almost', 'alone', 'along']
        newStopWords += ['already', 'also', 'although', 'always', 'am', 'among']
        newStopWords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
        newStopWords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
        newStopWords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
        newStopWords += ['because', 'become', 'becomes', 'becoming', 'been']
        newStopWords += ['before', 'beforehand', 'behind', 'being', 'below']
        newStopWords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
        newStopWords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
        newStopWords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
        newStopWords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
        newStopWords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
        newStopWords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
        newStopWords += ['every', 'everyone', 'everything', 'everywhere', 'except']
        newStopWords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
        newStopWords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
        newStopWords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
        newStopWords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
        newStopWords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
        newStopWords += ['herself', 'him', 'himself', 'his', 'how', 'however']
        newStopWords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
        newStopWords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
        newStopWords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
        newStopWords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
        newStopWords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
        newStopWords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
        newStopWords += ['nevertheless', 'next', 'nine', 'nobody', 'none'] #removed 'no'
        newStopWords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
        newStopWords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
        newStopWords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
        newStopWords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
        newStopWords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
        newStopWords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
        newStopWords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
        newStopWords += ['some', 'somehow', 'someone', 'something', 'sometime']
        newStopWords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
        newStopWords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
        newStopWords += ['then', 'thence', 'there', 'thereafter', 'thereby']
        newStopWords += ['therefore', 'therein', 'thereupon', 'these', 'they']
        newStopWords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
        newStopWords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
        newStopWords += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
        newStopWords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
        newStopWords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what']
        newStopWords += ['whatever', 'when', 'whence', 'whenever', 'where']
        newStopWords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
        newStopWords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
        newStopWords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
        newStopWords += ['within', 'without', 'would', 'yet', 'you', 'your']
        newStopWords += ['yours', 'yourself', 'yourselves'] #provided by Codecademy??

        # additional stopwords:
        newStopWords += ['[screenshot]', 'screenshot', '[screenshot]great', 'screenshot',
                         'the', 'smart', 'yah', 'got', 'nutty', 'moving', 'weeks', 'Got', 'So', 'today', 'Been', 'or']

        newStopWords += ['i', 'you', 'He', 'he', 'she', 'they', 'their', 'it'] # pronouns

        newStopWords += ['amd','nvda', 'tsla', 'goog', 'ba', 'fb', 'googl', 'intc', 'intel', 'csco', 'mu',
                         'smh', 'tsm','aapl', 'csco', 'poetf', 'photonics', 'dd', 'arwr', 't', 'infini', 'amc', 'arl',
                         'gme', 'nio', 'qs', 'msft', 'adbe', 'unh'] # Stock symbols or names

        newStopWords += [] # nouns

        #newStopWords += ['.', '?', '!', ';', ',', "'", '.'] # punctuation

        newStopWords += ['&', '#', '%', '$', '@', '/'] # symbols

        newStopWords += ['41.75', '530.05', '39', 'Two', 'two', 'One', 'one', 'Three', 'three', 'Four', 'four',
                        'Five', 'five', 'Six', 'six', 'Seven', 'seven', 'Eight', 'eight', 'Nine', 'nine', 'Ten',
                        'ten', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '39', ' 270',
                          '270000', '4033477', '244', '16', '399', '800', '270', '000', '60', '74',
                          '1600', '993', '392', '98', '00', '1601'] # numbers

        for w in newStopWords:
            stop_words.append(w)

        #print('stop_words: ', stop_words)

        #removes the stopwords from the column body_processed
        %time df['body_processed'] = df['body_processed'].map(lambda x : ' '.join([w for w in x.split() if w not in stop_words]))

        return df

    # 40 finds certain words in the strings ('body') and deletes the entire record.
    #Note: When the record is deleted the df is re-indexed. The index for the while statement is not so the result is
    #that the record right after the deleted record is skipped. To remedy the problem the index (i) for the while statement 
    #is decremented by one.
    #Also, the filtering terms are not case sensitive.
    def filter_records(df):
        import fnmatch

        data = []
        counter = 0
        advert = ['* sec *', '* daily News *', '*Huge Print*', '* Form *', 
                  '*SweepCast*', '*Large Print*', '*Huge Print*', '*8-K*', 
                  '*SmartOptions*', '*Big Trade*', '*SEC Form*', '*Notice of Exempt*', 
                  '*created_at*', '*stock news*', '*Trading Zones*', '*Entry:*', 
                  '*New Article*', '*ooc.bz*', '*http*', 'Huge Trade', 'Trading is easy', 
                  'www.', '#wallstreetbets', 'wallstreetbets', 'Huge Trade', '#unitedtraders', 
                  'stockbeep.com', 'Big Trade'] # words or phrases whose records are to be removed; It is not case sensitive.

        for a in advert:
            i = 0
            df = df.reset_index(drop = True) # resets the index before each iteration; removes the gaps; resets len(df)
            while i < len(df):
                dat = df.loc[i, ('body')] # 2 represents the 'body' column
                data = [dat] # sets the string from the df into a list for the fnmatch.filter
                #print('index = ', i)
                filtered = fnmatch.filter(data, a) # compares the information in the 'body' column with the 'advert' list; it places the matched items in the 'filtered' variable.
                #https://www.geeksforgeeks.org/fnmatch-unix-filename-pattern-matching-python/

                if len(filtered) != 0: #if returns a True then record needs to be removed
                    counter += 1

                    df = df.drop(df.index[i]) # drops (deletes) the record
                    df = df.reset_index(drop = True) # resets the index; removes the gaps   

                    #print('after the record is dropped:', df..log[i,('body')], 'i = ', i)

                    #Note: When the record is dropped there is a change in the 'index' number. after the drop index number
                    #5 becomes index number 4. Since the counter increments one more time it skips the record right after
                    #the record that was just checked. That is why it takes multiple runs to remove all of the target
                    #records. To correct this decrement the index, i, by

                    i -= 1

                i += 1

        df = df.reset_index(drop = True) # resets the index; removes the gaps   
        len(df)
        return df

    #50 Vader sentiment analyzer
    @staticmethod
    def vader_sentiment(df):
        """Add a 'raw_compound' column holding the Vader compound score of each body.

        Args:
            df: dataframe with a string 'body' column.

        Returns:
            The same dataframe with the 'raw_compound' column written.
        """
        # Imported locally: the nltk imports at the top of the file are
        # commented out, so the name is not otherwise guaranteed to exist.
        from nltk.sentiment.vader import SentimentIntensityAnalyzer

        vader = SentimentIntensityAnalyzer()
        df['raw_compound'] = df['body'].apply(
            lambda tweet: vader.polarity_scores(tweet)['compound'])

        print('The number of clean records in the df are: ', len(df), '\n')

        return df

    # 60 creates a new column called 'compound_bin' from the raw_compound scores. This is the column
    # where the translated raw compound scores will be placed (either -1, 0 or 1).
    def compound_binning(df):
        df['compound_bin'] = df['raw_compound'] # Creates a column called 'compound_bin'

        #del df['Unnamed: 0'] # deletes the column named 'Unnamed: 0'

        #print(df.head())

        # 70 converts the 'raw_compound' data to either a 1, 0 or -1. 1 if nltk sentiment number are >= .1; 0 if -.1 < x < .1 
        #and -1 if <= -.1 and over-rights the value in compound_bin

        i = 0
        while i < len(df):
            if df.loc[i,('raw_compound')] >= 0.1: # column 5 is 'raw_compound'
                df.loc[i, ('compound_bin')] =  np.int(df.loc[i, ('raw_compound')] + .9) # column 6 is 'compound_bin'

            if df.loc[i,('raw_compound')]  < .1 and df.loc[i,('raw_compound')] > -.1:
                df.loc[i, ('compound_bin')] = 0   

            if df.loc[i,('raw_compound')]  <= -.1:
                df.loc[i, ('compound_bin')] =  np.int(df.loc[i,('raw_compound')] - .9)
            i += 1

        #print(df)

        return df

    # 80 Converts sentiment ratings into numerical values and put the value into 'sentiment_number'.
    #Stocktwits sentiment rating (bullish or Bearish) is used as the standard;
    #Stocktwits sentiment rating of 'None' is not used as a standard because people could have simply elected to not enter it.
    #https://www.dataquest.io/blog/tutorial-add-column-pandas-dataframe-based-on-if-else-condition/
    def convert_sentiment_to_numerical(df):

        print('\nconverting sentiment values to numerical values ...')
        import numpy as np

        conditions = [(df['sentiment'] == 'Bullish'), (df['sentiment'] == 'None'), (df['sentiment'] == 'Bearish')] #column 4 is 'sentiment'

        values = [1.0, 0.0, -1.0]

        %time df['sentiment_number'] = np.select(conditions, values)

        df['modified_rating'] = 0 # adds a column "modified_rating" and sets it equal to 0
        df['modified?'] = 'No' # adds a column "modified?" and sets it equal to 'No'


        #print(df)

        return df

    # 90 Determines the percent correct and incorrect for the Vader sentiment values vs the stocktwits sentiment values
    def vader_correct(df):
        correct = 0
        incorrect = 0
        total = len(df)
        i = 0
        while i < len(df):
            if df.loc[i, ('compound_bin')] == df.loc[i, ('sentiment_number')]: # column 6 is 'compound_bin' and column 7 is 'sentiment_number'
                correct += 1
            else:
                incorrect += 1 

            i += 1

        print('The Vader percent correct to stocktwits raw data is:', int(100 * correct/total), '%')
        print('The Vader percent incorrect to stocktwits raw data is:', int(100 * incorrect/total), '%')

        #return df

    # 100 counts how many "None" sentiment values are there for the stocktwits sentiment value
    def none_count_raw(df):
        i = 0
        sentiment_number = 0

        while i < len(df):
            if df.loc[i,('sentiment')] == 'None': # column 4 is 'sentiment'
                sentiment_number += 1
            i += 1

        print('The number of "None" stocktwits sentiment values is:', sentiment_number)
        print('The percentage of "None" values is:', (int(sentiment_number/len(df) * 1000)/10), '%')

    # 110 This removes every other "None" record to reduce the total number of "None" rating. This is to make
    #the 'None' proportions more equal. It also prints the ratios of each sentiment response to the total number
    #of responses.
    def remove_every_other(df):
        i = 0
        counter_before = 0
        counter_after = 0
        df = df.reset_index(drop = True) #resets the index to be continuous 

        while i < len(df): #count the 'None' records before the drop.
            if df.loc[i,('sentiment')] == 'None':
                counter_before += 1
            i += 1

        print('\nThe total number of records is: ', len(df))
        print('The number of "None" stocktwits sentiment values before removal is:', counter_before)

        i = 0
        while i < len(df):
            if df.loc[i,('sentiment')] == 'None': #column 4 is sentiment
                if i % 2 == 0: #identifies every even index where the sentiment is "None"
                    #df = df.drop(df.index[i]) #drops (deletes) the record
                    df = df.drop(df.index[i])
                    df = df.reset_index(drop = True) #resets the index to be continuous

                    i -= 1


            i += 1

        df = df.reset_index(drop = True) #resets the index to be continuous 

        i = 0
        counter_after = 0

        while i < len(df):
            if df.loc[i,('sentiment')] == 'None':
                counter_after += 1
            i += 1

        print('\nThe total number of records is: ', len(df))
        print('The number of "None" stocktwits sentiment values after removal is:', counter_after)
        print('The percentage of "None" values is:', (int(counter_after/len(df) * 1000)/10), '%')

        i = 0
        sentiment_number = 0

        while i < len(df):
            if df.loc[i,('sentiment')] == 'Bullish':
                sentiment_number += 1
            i += 1

        print('The number of "Bullish" stocktwits sentiment values is:', sentiment_number)
        print('The percentage of "Bullish" values is:', (int(sentiment_number/len(df) * 1000)/10), '%')

        i = 0
        sentiment_number = 0

        while i < len(df):
            if df.loc[i,('sentiment')] == 'Bearish':
                sentiment_number += 1
            i += 1

        print('The number of "Bearish" stocktwits sentiment values is:', sentiment_number)
        print('The percentage of "Bearish" values is:', (int(sentiment_number/len(df) * 1000)/10), '% \n')

        return df    

    # 115 Provides statistics on sentiments; bullish, none or bearish.
    def stats(df):

        i = 0
        sentiment_number = 0

        while i < len(df):
            if df.loc[i,('sentiment')] == 'None':
                sentiment_number += 1
            i += 1

        print('The total number of records is: ', len(df))
        print('The number of "None" stocktwits sentiment values is:', sentiment_number)
        print('The percentage of "None" values is:', (int(sentiment_number/len(df) * 1000)/10), '%')

        i = 0
        sentiment_number = 0

        while i < len(df):
            if df.loc[i,('sentiment')] == 'Bullish':
                sentiment_number += 1
            i += 1

        print('The number of "Bullish" stocktwits sentiment values is:', sentiment_number)
        print('The percentage of "Bullish" values is:', (int(sentiment_number/len(df) * 1000)/10), '%')

        i = 0
        sentiment_number = 0

        while i < len(df):
            if df.loc[i,('sentiment')] == 'Bearish':
                sentiment_number += 1
            i += 1

        print('The number of "Bearish" stocktwits sentiment values is:', sentiment_number)
        print('The percentage of "Bearish" values is:', (int(sentiment_number/len(df) * 1000)/10), '%')

    # 120 Allows user to manually input value when stocktwits sentiment value is "None"
    # It counts every 20 edits and gives the user the option to quit. If the user chooses to quit
    # it breaks from the while look and writes the df to a csv file so all work is saved up to that point.
    # upon start up it ask if thie is the first time processing the raw data. If no it loads the csv file into
    # the dataframe and starts where the previous session left off. If "modified?" is "Yes and "sentiment" is "None"
    # it skips the record. Therefore it will re-start at the first "modified?" is "No" and "sentiment" is "None"
    def edit(df):

        import copy

        i = 0
        counter = 0    # counter to see if user want to stop

        while i < len(df):
        #while i < 6:

            if df.loc[i,('sentiment')] == 'None' and df.loc[i,('modified?')] == 'No': # Column 9 is 'modified?'
                print('\nindex number:', i, '\n', df.loc[i, ('body')])
                #print('This is the body of the tweet:\n', df..log[i,('body')])
                rating = int(input('Enter your rating (1, 0 or -1.):')) 
                df.loc[i,('modified_rating')] = copy.deepcopy(rating) # writes inputed number to the 'modified_rating'
                df.loc[i,('modified?')] = 'Yes' # sets "modified?" equal to 'Yes' to identify which records have been modified; so that it can start at the next record at start up

                counter += 1

            elif df.loc[i,('sentiment')] == 'Bearish':

                df.loc[i,('modified_rating')] = df.loc[i,('sentiment_number')] #copies the stocktwits 'sentiment_number' (7) to the 'modified_rating(8)

            elif df.loc[i,('sentiment')] == 'Bullish':

                df.loc[i,('modified_rating')] = df.loc[i,('sentiment_number')] #copies the stocktwits 'sentiment_number' (7) to the 'modified_rating(8)

            if counter == 20: # represents 20 edits
                quit = input('Do you want to quit? (Enter either a "y" or "Y") ')
                if quit == 'y' or quit == 'Y':
                    print('You are exiting.')
                    break
                else:
                    counter = 0 # resets the counter to 0 so there must be another 20 records reviewed and modified 

            i += 1

        #df.to_csv(filename, index = False)
        #print('The csv file was written. File name: ', filename)

        return df

    # 140 This will change the modified rating (8) to the nltk rating (6) only when they are opposite to see if it improves 
    #the accuracy number 
    @staticmethod
    def change_opp_nltk(df):
        """Overwrite 'modified_rating' with the Vader bin where the two are opposite.

        Where 'compound_bin' is -1 and 'modified_rating' is 1 (or the other
        way round), 'modified_rating' takes the 'compound_bin' value. The
        dataframe is then written to a CSV file whose name the user can
        confirm or replace.

        The original compared the 'sentiment' column with -1 / 1. Elsewhere
        in this class that column is compared with the labels 'Bullish',
        'None' and 'Bearish', so the comparison never matched and no rating
        changed. The comments on this function name 'compound_bin' as the
        intended source, which is what is used now.

        Args:
            df: dataframe with 'compound_bin' and 'modified_rating' columns.

        Returns:
            The same dataframe with the flipped ratings.
        """
        filename = 'tech stockTwit 02232021 opposite compound_bin vs modified_rating.csv'

        print('The name of the csv file that will be written to is: ', filename)

        correct_name = input('Is this the correct filename? (enter "N" or "n" for no)')

        if correct_name in ('N', 'n'):
            filename = input('What is the correct name?')

        vader = df['compound_bin']
        rating = df['modified_rating']
        opposite = ((vader == -1) & (rating == 1)) | ((vader == 1) & (rating == -1))
        df.loc[opposite, 'modified_rating'] = vader[opposite]

        df.to_csv(filename, index=False)
        print('The csv file was written. File name: ', filename)

        return df

    # 180 counts how many "None" sentiment values are there for the stocktwits sentiment modified rating values
    def none_count(df):
        i = 0
        sentiment_number = 0

        while i < len(df):
            if df.loc[i,('modified_rating')] == 0.0: # column #8 is 'modified_rating'
                sentiment_number += 1
            i +=1

        print('The number of "None" stocktwits sentiment values is:', sentiment_number)
        print('The percentage of "None" values is:', (int(sentiment_number/len(df) * 1000)/10), '%')


    #480 This removes words from the list of stopwords and writes list to csv file
    # https://stackoverflow.com/questions/29771168/how-to-remove-words-from-a-list-in-python#:~:text=one%20more%20easy%20way%20to%20remove%20words%20from,%3D%20words%20-%20stopwords%20final_list%20%3D%20list%20%28final_list%29
    #new_words = list(filter(lambda w: w not in stop_words, initial_words))
    def remove_from_stopwords(sw, relevant_path):
        WordsToBeRem = ['no']
        stopWords = list(filter(lambda w: w not in WordsToBeRem, sw)) #It will retain anyword in sw that is not in WordsToBeRemoved

        #converts the stopword list to a df so that it can then be written to a csv file
        df_stopwords = pd.DataFrame(stopWords, columns = ['stopwords'])
        name_of_csv_file = relevant_path + '/' + 'stopwords.csv'
        df_stopwords.to_csv(name_of_csv_file, index = False) #writes stopwords to csv file

        #print(stopWords)

        return stopWords

    #490 Checks to see if the word was removed from the stopWords list.
    #inputs: stopword list (sw) and the word that should have been removed from it (WordToBeRem):
    def check_stopwords(sw, WordToBeRem):

        r = 0

        for w in sw:
            #print(w)
            if w == WordToBeRem:
                print('The word ', w , ' is still in the stopWords list!')
                r += 1

        if r == 0:
            print('It did remove the words from the stopWords list!')

        #print(len(stopWords))

    #510 Removes stopwords from all the "body" text (tweets); to do this it must tokenize the string which means it must parse 
    # the string into individual words. It then compares the words with the words in the stopwords list and if there is not 
    # match it puts the word into the "wordsFiltered" list. It keeps appending to the list until all of the words are checked.
    # It then joins the individual words back into a string.
    #There is a difference between "deep" copy and "shallow" copy. "Deep" copy make a copy where the index and data are
    # separate from the original. "Shallow" copy is like a pointer where the two df share a common index and data
    #dfScrubbed = df #This is a shallow copy
    @staticmethod
    def rem_stopwords(df, stopWords):
        """Return a copy of *df* whose 'body' text has the stopwords removed.

        Each body is tokenized with nltk's ``word_tokenize``, the tokens found
        in *stopWords* are discarded and the rest are joined with single
        spaces. Records whose body is missing are then dropped and the index
        is renumbered.

        The original called the sibling helper ``remove_empty_body_rows`` by
        its bare name, which is not resolvable from inside a method of this
        class; the two statements of that helper are applied directly here.

        NOTE(review): a body made up only of stopwords becomes an empty
        string, which is not a missing value and is therefore kept - confirm
        whether such rows should be dropped too.

        Args:
            df: dataframe with a string 'body' column (not modified).
            stopWords: iterable of words to remove (case sensitive).

        Returns:
            The scrubbed copy of the dataframe.
        """
        from nltk.tokenize import word_tokenize

        blocked = set(stopWords)  # O(1) membership test per token

        dfScrubbed = df.copy()  # deep copy, the caller's frame stays untouched
        dfScrubbed['body'] = df['body'].map(
            lambda text: ' '.join(
                word for word in word_tokenize(text) if word not in blocked))

        # removes empty body rows and reindexes
        dfScrubbed = dfScrubbed.dropna(subset=['body']).reset_index(drop=True)

        # checks to see if there are any empty records left
        print('Are there any empty body records?')
        empty = np.where(pd.isnull(dfScrubbed['body']))
        print(empty)

        return dfScrubbed

    #550 converts the scrubbed_compound scores into a 1 significant figure integer from a float number; rounding up
    # this is only needed if you are going to uses the 'scrubbed_compound' value as the label.
    def int_conversion(dfs):
        dfs['scrubbed_compound'] =  np.int64((dfs['scrubbed_compound'] + .05) * 10)

    # 550 converts the 'scrubbed_compound' (column 10) data to either a 1, 0 or -1.  
    # if nltk sentiment number are >= .1; 0 if -.1 < x < .1 and -1 if <= -.1 and over-rights the value in compound_bin
    # creates a new column called 'compound_bin' from the raw_compound scores
    def bin_sentiment(dfs):
        dfs['scrubbed_compound_bin'] = dfs['scrubbed_compound'] # creates a new column 'scrubbed_compound_bin' (column 11)

        i = 0
        while i < len(df):
            if dfs.loc[i,('scrubbed_compound')] >= 0.1: # column 10 is 'scrubbed_compound'
                dfs.loc[i, ('scrubbed_compound_bin')] =  np.int(dfs.loc[i,('scrubbed_compound')] + .9) # column 11 is 'scurbbed_compound_bin'

            if dfs.loc[i,('scrubbed_compound')] < .1 and dfs.loc[i,('scrubbed_compound')] > -.1:
                dfs.iloc[i, 11] = 0   

            if dfs.loc[i,('scrubbed_compound')] <= -.1:
                dfs.loc[i, ('scrubbed_compound_bin')] =  np.int(dfs.loc[i,('scrubbed_compound')]  - .9)
            i += 1

        print(dfs)

    #640 compares the first record (index = 0) raw data ("body" column) with scrubbed (stopwords removed) data
    #inputs: df - original df; dfs - scrubbed df (stopwords removed)
    def compare_scrubbed(df, dfs):
        print(df.loc[0,('body')])
        print(dfs.loc[0,('body')])

    # 650 Loads and combines two different dataframes in df; this is to combine two input datasets where the 'none'
    #values have been modified; this is to see if increased records will increase the accuracy of the model.
    def combine_dfs(df1, df2):

        df = df1.append(df2)

        print('The length of file 1 is:', len(df1))
        print('The length of file 2 is:', len(df2))

        print('The length of the combined dataframe is:', len(df))

        return df

    # 660 Writes a csv file
    #input: df that is to be saved as a csv; output file name (eg 'tech stockTwit 03112021 dup advert stopwords.csv'
    def write_csv(df, filename_output, relevant_path):

        df.to_csv(relevant_path + '/' + filename_output, index = False, encoding = 'utf-8')
        print('The csv file was written. File name: ', filename_output)

    # displays a list of the files that have a csv suffix
    def list_dir_files(relevant_path):
        # https://clay-atlas.com/us/blog/2019/10/27/python-english-tutorial-solved-unicodeescape-error-escape-syntaxerror/?doing_wp_cron=1618286551.1528689861297607421875
        #need to change \ to /

        import os

        included_extensions = ['csv']
        file_names = [fn for fn in os.listdir(relevant_path) # uses os.listdir to display only .csv files
                  if any(fn.endswith(ext) for ext in included_extensions)]

        print('Path: ', relevant_path)

        for f in file_names:
            print(f)

    # removes specific rows and resets the index
    def remove_empty_body_rows(df):
        df.dropna(subset=['body'], inplace=True) #drops empty body records
        df = df.reset_index(drop = True) # resets the index
        return df

    #### checks to see if there are any empty records left
    def empty_records_check(df):
        print('Are there any empty body records?')
        empty = np.where(pd.isnull(df['body'])) #checks to see if there are any empty records in the column 'body'

        if empty[0].size == 0:
            print('There are no empty records! \n', empty)
        else:
            print('There are empty records ...\n', empty)

    #### Removes emojis
    def remove_emoji(string):
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002500-\U00002BEF"  # chinese char
                                   u"\U00002702-\U000027B0"
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   u"\U0001f926-\U0001f937"
                                   u"\U00010000-\U0010ffff"
                                   u"\u2640-\u2642"
                                   u"\u2600-\u2B55"
                                   u"\u200d"
                                   u"\u23cf"
                                   u"\u23e9"
                                   u"\u231a"
                                   u"\ufe0f"  # dingbats
                                   u"\u3030"
                                   "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', string)

    # asks for the names of the two files to combine and writes the combined csv file (unfinished)
    @staticmethod
    def combine_two_files():
        """Unfinished stub: reads one line from standard input and discards it.

        NOTE(review): the comment on this function says it should combine two
        files and write a csv file; only the first prompt exists so far.
        """
        first_filename = input()

    @staticmethod
    def rem_dup_adver_ever_oth_emoji(df):
        """Interactively apply the optional clean-up steps to *df*.

        Asks, in order, whether to remove duplicates, advertisements, every
        other "None" record and emojis, and runs each step the user accepts.
        An answer counts as yes when it is contained in ``yes_resp``, a name
        this function expects to be defined at module level.

        The sibling helpers are now called through the class; their bare
        names are not resolvable from inside a method. ``r_emoji`` is set
        before the emoji step runs, so an empty dataframe no longer leaves it
        unassigned.

        Args:
            df: dataframe with 'body' and 'sentiment' columns.

        Returns:
            A tuple ``(df, r_emoji, rem_every_other, remove_advertisements,
            remove_dupl)``: the cleaned dataframe followed by one tag string
            per step, empty when the step was skipped.
        """
        helpers = SentimentAnalysisPreprocessing

        # remove duplicates
        r_d = input('Do you want to remove duplicates? [Press enter if no] ')
        if r_d in yes_resp:
            df = helpers.remove_duplicates(df)
            remove_dupl = 'r_d '
        else:
            remove_dupl = ''

        # remove advertisements
        r_a = input('Do you want to remove advertisements? [Press enter if no] ')
        if r_a in yes_resp:
            df = helpers.filter_records(df)
            remove_advertisements = 'r_a '
        else:
            remove_advertisements = ''

        # 110 OPTIONAL: removes every other "None" record to make the
        # proportions of the sentiment classes more equal.
        r_e_o = input('Do you want to remove every other neutral sentiment record: [Press enter if no] ')
        if r_e_o in yes_resp:
            df = helpers.remove_every_other(df)
            rem_every_other = 'r_e_o '
        else:
            rem_every_other = ''

        # remove emojis
        r_emoj = input('Do you want to remove emojis from the body records: [Press enter if no] ')
        if r_emoj in yes_resp:
            r_emoji = 'r_emoj '
            df['body'] = df['body'].map(helpers.remove_emoji)
        else:
            r_emoji = ''

        return df, r_emoji, rem_every_other, remove_advertisements, remove_dupl


    @staticmethod
    def vader_analysis(df):
        """Run the Vader steps on *df*: scoring, binning and label conversion.

        Calls, in order, ``vader_sentiment`` (adds 'raw_compound'),
        ``compound_binning`` (adds 'compound_bin') and
        ``convert_sentiment_to_numerical`` (adds 'sentiment_number' and the
        edit columns). The helpers are called through the class because
        their bare names are not resolvable from inside a method.

        Args:
            df: dataframe with 'body' and 'sentiment' columns.

        Returns:
            The dataframe returned by the last step.
        """
        helpers = SentimentAnalysisPreprocessing

        df = helpers.vader_sentiment(df)
        print('Produced Vader sentiment values.')

        df = helpers.compound_binning(df)
        print('Completed the Vader compound binning.')

        df = helpers.convert_sentiment_to_numerical(df)
        print('Converted the Stocktwits sentiments to a numberical value (1,0,-1).')
        print('\nAll finished with the Vader sentiment analysis.\n')

        return df

In [None]:
# Interactive start of the run: pick the folder, list its csv files, let the
# user choose one and load it into df.
# NOTE(review): list_dir_files and getData are defined inside
# SentimentAnalysisPreprocessing in the part of the file visible here; confirm
# that module-level functions with these names exist, otherwise the two bare
# calls below raise NameError.
relevant_path = 'C:/Users/pstri/OneDrive/Documents/Personal/Kokoro/NLTK/Code Project/Scraped Files'
#relevant_path = 'C:/Users/pstri/OneDrive/Documents/Personal/Kokoro/NLTK/Code Project/Preprocessing'


print('Here is a list of the csv files to choose from: \n')
list_dir_files(relevant_path) # gives all of the file options in the relevant path.

# pauses for two seconds before the file-name prompt
time.sleep(2)

name = input('\nWhat file do you want to use? \n')
df = getData(relevant_path + '/' + name) #returns df; reads csv file into df
print('Imported the csv file.')

# the answer is stored in finviz_resp; its use lies beyond this part of the file
finviz_resp = input('Is this a file from scraping Finviz?')