In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk.classify
from nltk import NaiveBayesClassifier
import os
import re
import sys
import time
analyzer = SentimentIntensityAnalyzer()

Revision History

This program takes raw data from Stocktwits and preprocesses the data to generate NLP multinomial naive bayes (MNB) sentiment model. It runs the Vader model on the raw data first. There are three standard sentiment values that can be used to determine sentiment. They are:
    o NLTK Vader using raw data (Rev 2.0)
    o NLTK Vader using preprocessed (Rev 2.1)
    o MNB using raw data (Rev 2.0)
    o MNB using preprocessed data (Rev 2.0)

Rev 2.0
    o Peforms NLTK Vader sentiment analysis on Stocktweets tweets
    o Gives the user the option to:
        x remove duplicates
        x remove adverstisements
        x edit the neutral or None comments
        x remove half of the neutral or None comments
        x remove stopwords
        x write a csv file
        x combine two csv files into one dataframe
        
Rev 2.1
    o Gives the user the choice of running the Vader model after all of the processing options have been exhausted.
    o Adds the ability to remove emojis
    
Rev 2.2
    o converts all statements using iloc to loc
    
Rev 2.3
    o gives the user the choice to run the Vader analysis before or after the removal of duplicates, advertisements, emojis and stopwords.
  
        

************************
Table of Contents
#10* initializes the dataframe "df" and imports the csv into df; 
#20* calls getdata to import the csv into the dataframe, 'dfAPI'
#30 removes any duplicate records; duplicate records imply bot records
#40 finds certain words in the strings ('body') and deletes the entire record.  
#50* Vader sentiment analyzer
#60* creates a new column called 'compound_bin' from the raw_compound scores
#70* converts the 'raw_compound' data to either a 1, 0 or -1. 1 if nltk sentiment number are >= .1; 0 if -.1 < x < .1 
#80* Converts sentiment ratings into numerical values and put the value into 'sentiment_number'.
#90 Determines the percent correct and incorrect for the Vader sentiment values vs the stocktwits sentiment values
#100 counts how many "None" sentiment values are there for the stocktwits sentiment value
#110 This removes every other "None" record to reduce the total number of "None" rating. This is to make
#115 Provides statistics on sentiments; bullish, none or bearish.
#120 Allows user to manually input value when stocktwits sentiment value is "None"
#130 Loads a csv file into the df dfAPI and print out the first 21 records
#140 This will change the modified rating to the nltk rating only when they are opposite to see if it improves 
the accuracy number 
#440 sets up stopword removal; returns stopWords
#470 creates a list of new stopwords and then adds them to the set provided by nltk
Note: it is case sensitive; Input is the nltk stopword list ("stopWords")
#490 Checks to see of the words were removed from the stopWords list.
inputs: stopword list: output from def remove_from_stopwords(sw); the word to be removed
#510 Removes stopwords from all the "body" text (tweets); to do this it must tokenize the string which means it must parse 
the string into individual words. It then compares the words with the words in the stopwords list and if there is not 
match it puts the word into the "wordsFiltered" list. It keeps appending to the list until all of the words are checked.
It then joins the individual words back into a string.
There is a difference between "deep" copy and "shallow" copy. "Deep" copy make a copy where the index and data are
separate from the original. "Shallow" copy is like a pointer where the two df share a common index and data
dfScrubbed = df #This is a shallow copy
#550 converts the scrubbed_compound scores into a 1 significant figure integer from a float number; rounding up
this is only needed if you are going to uses the 'scrubbed_compound' value as the label.
#550 converts the 'scrubbed_compound' (column 10) data to either a 1, 0 or -1.  
if nltk sentiment number are >= .1; 0 if -.1 < x < .1 and -1 if <= -.1 and over-rights the value in compound_bin
creates a new column called 'compound_bin' from the raw_compound scores
#640 compares the first record (index = 0) raw data ("body" column) with scrubbed (stopwords removed) data
inputs: df - original df; dfs - scrubbed df (stopwords removed)
#650 Loads and combines two different dataframes in df; this is to combine two input datasets where the 'none'
values have been modified; this is to see if increased records will increase the accuracy of the model.
#660 Writes a csv file

METHODS

In [18]:
# methods

# 10 initializes the dataframe "df" and imports the csv into df; 
# the argument is the name/address of the file.
# https://stackoverflow.com/questions/33440805/pandas-dataframe-read-csv-on-bad-data
def getData(name):
    df1 = pd.DataFrame() # defines df1 as a dataframe
    df1 = pd.read_csv(name, header = 0)
    return df1

# 30 removes any duplicate records; duplicate records imply bot records
def remove_duplicates(df):
    df = df.drop_duplicates()
    len(df)
    return df

# 40 finds certain words in the strings ('body') and deletes the entire record.
#Note: When the record is deleted the df is re-indexed. The index for the while statement is not so the result is
#that the record right after the deleted record is skipped. To remedy the problem the index (i) for the while statement 
#is decremented by one.
#Also, the filtering terms are not case sensitive.
def filter_records(df):
    import fnmatch

    data = []
    counter = 0
    advert = ['* sec *', '* daily News *', '*Huge Print*', '* Form *', '*SweepCast*', '*Large Print*', 
          '*Huge Print*', '*8-K*', '*SmartOptions*', '*Big Trade*', '*SEC Form*', '*Notice of Exempt*', 
          '*created_at*', '*stock news*', '*Trading Zones*', '*Entry:*', '*New Article*', '*ooc.bz*', 
          '*http*', 'Huge Trade', 'Trading is easy', 'www.', '#wallstreetbets', 'wallstreetbets',
          'Huge Trade', '#unitedtraders', 'stockbeep.com', 'Big Trade'] # words or phrases whose records are to be removed; It is not case sensitive.

    for a in advert:
        i = 0
        df = df.reset_index(drop = True) # resets the index before each iteration; removes the gaps; resets len(df)
        while i < len(df):
            dat = df.loc[i, ('body')] # 2 represents the 'body' column
            data = [dat] # sets the string from the df into a list for the fnmatch.filter
            #print('index = ', i)
            filtered = fnmatch.filter(data, a) # compares the information in the 'body' column with the 'advert' list; it places the matched items in the 'filtered' variable.
            #https://www.geeksforgeeks.org/fnmatch-unix-filename-pattern-matching-python/

            if len(filtered) != 0: #if returns a True then record needs to be removed
                counter += 1
            
                df = df.drop(df.index[i]) # drops (deletes) the record
                df = df.reset_index(drop = True) # resets the index; removes the gaps   

                #print('after the record is dropped:', df..log[i,('body')], 'i = ', i)
                
                #Note: When the record is dropped there is a change in the 'index' number. after the drop index number
                #5 becomes index number 4. Since the counter increments one more time it skips the record right after
                #the record that was just checked. That is why it takes multiple runs to remove all of the target
                #records. To correct this decrement the index, i, by
                
                i -= 1
    
            i += 1

    df = df.reset_index(drop = True) # resets the index; removes the gaps   
    len(df)
    return df

#50 Vader sentiment analyzer
def vader_sentiment(df):
    vader = SentimentIntensityAnalyzer()

    f = lambda tweet: vader.polarity_scores(tweet)['compound']

    df['raw_compound'] = df['body'].apply(f)

    print('The number of clean records in the df are: ', len(df) , '\n')
    #print(df.head())
    
    return df

# 60 creates a new column called 'compound_bin' from the raw_compound scores. This creates a column that the raw 
#where the translated raw compound scores will be placed (either a -1, 0, 1.)
def compound_binning(df):
    df['compound_bin'] = df['raw_compound'] # Creates a column called 'compound_bin'
    
    #del df['Unnamed: 0'] # deletes the column named 'Unnamed: 0'
    
    #print(df.head())
    
    # 70 converts the 'raw_compound' data to either a 1, 0 or -1. 1 if nltk sentiment number are >= .1; 0 if -.1 < x < .1 
    #and -1 if <= -.1 and over-rights the value in compound_bin

    i = 0
    while i < len(df):
        if df.loc[i,('raw_compound')] >= 0.1: # column 5 is 'raw_compound'
            df.loc[i, ('compound_bin')] =  np.int(df.loc[i, ('raw_compound')] + .9) # column 6 is 'compound_bin'
        
        if df.loc[i,('raw_compound')]  < .1 and df.loc[i,('raw_compound')] > -.1:
            df.loc[i, ('compound_bin')] = 0   
        
        if df.loc[i,('raw_compound')]  <= -.1:
            df.loc[i, ('compound_bin')] =  np.int(df.loc[i,('raw_compound')] - .9)
        i += 1
    
    #print(df)
    
    return df

# 80 Converts sentiment ratings into numerical values and put the value into 'sentiment_number'.
#Stocktwits sentiment rating (bullish or Bearish) is used as the standard;
#Stocktwits sentiment rating of 'None' is not used as a standard because people could have simply elected to not enter it.
#https://www.dataquest.io/blog/tutorial-add-column-pandas-dataframe-based-on-if-else-condition/
def convert_sentiment_to_numerical(df):
    import numpy as np

    conditions = [(df['sentiment'] == 'Bullish'), #column 4 is 'sentiment'
                  (df['sentiment'] == 'None'),
                  (df['sentiment'] == 'Bearish')]

    values = [1.0, 0.0, -1.0]

    df['sentiment_number'] = np.select(conditions, values)

    df['modified_rating'] = 0 # adds a column "modified_rating" and sets it equal to 0
    df['modified?'] = 'No' # adds a column "modified?" and sets it equal to 'No'


    #print(df)
    
    return df

# 90 Determines the percent correct and incorrect for the Vader sentiment values vs the stocktwits sentiment values
def vader_correct(df):
    correct = 0
    incorrect = 0
    total = len(df)
    i = 0
    while i < len(df):
        if df.loc[i, ('compound_bin')] == df.loc[i, ('sentiment_number')]: # column 6 is 'compound_bin' and column 7 is 'sentiment_number'
            correct += 1
        else:
            incorrect += 1 
        
        i += 1
        
    print('The Vader percent correct to stocktwits raw data is:', int(100 * correct/total), '%')
    print('The Vader percent incorrect to stocktwits raw data is:', int(100 * incorrect/total), '%')

    #return df

# 100 counts how many "None" sentiment values are there for the stocktwits sentiment value
def none_count_raw(df):
    i = 0
    sentiment_number = 0

    while i < len(df):
        if df.loc[i,('sentiment')] == 'None': # column 4 is 'sentiment'
            sentiment_number += 1
        i += 1

    print('The number of "None" stocktwits sentiment values is:', sentiment_number)
    print('The percentage of "None" values is:', (int(sentiment_number/len(df) * 1000)/10), '%')
        
# 110 This removes every other "None" record to reduce the total number of "None" rating. This is to make
#the 'None' proportions more equal. It also prints the ratios of each sentiment response to the total number
#of responses.
def remove_every_other(df):
    i = 0
    counter_before = 0
    counter_after = 0
    df = df.reset_index(drop = True) #resets the index to be continuous 
    
    while i < len(df): #count the 'None' records before the drop.
        if df.loc[i,('sentiment')] == 'None':
            counter_before += 1
        i += 1

    print('\nThe total number of records is: ', len(df))
    print('The number of "None" stocktwits sentiment values before removal is:', counter_before)

    i = 0
    while i < len(df):
        if df.loc[i,('sentiment')] == 'None': #column 4 is sentiment
            if i % 2 == 0: #identifies every even index where the sentiment is "None"
                #df = df.drop(df.index[i]) #drops (deletes) the record
                df = df.drop(df.index[i])
                df = df.reset_index(drop = True) #resets the index to be continuous
                
                i -= 1

            
        i += 1
    
    df = df.reset_index(drop = True) #resets the index to be continuous 

    i = 0
    counter_after = 0

    while i < len(df):
        if df.loc[i,('sentiment')] == 'None':
            counter_after += 1
        i += 1

    print('\nThe total number of records is: ', len(df))
    print('The number of "None" stocktwits sentiment values after removal is:', counter_after)
    print('The percentage of "None" values is:', (int(counter_after/len(df) * 1000)/10), '%')

    i = 0
    sentiment_number = 0

    while i < len(df):
        if df.loc[i,('sentiment')] == 'Bullish':
            sentiment_number += 1
        i += 1

    print('The number of "Bullish" stocktwits sentiment values is:', sentiment_number)
    print('The percentage of "Bullish" values is:', (int(sentiment_number/len(df) * 1000)/10), '%')
            
    i = 0
    sentiment_number = 0

    while i < len(df):
        if df.loc[i,('sentiment')] == 'Bearish':
            sentiment_number += 1
        i += 1

    print('The number of "Bearish" stocktwits sentiment values is:', sentiment_number)
    print('The percentage of "Bearish" values is:', (int(sentiment_number/len(df) * 1000)/10), '% \n')
            
    return df    

# 115 Provides statistics on sentiments; bullish, none or bearish.
def stats(df):
    
    i = 0
    sentiment_number = 0

    while i < len(df):
        if df.loc[i,('sentiment')] == 'None':
            sentiment_number += 1
        i += 1

    print('The total number of records is: ', len(df))
    print('The number of "None" stocktwits sentiment values is:', sentiment_number)
    print('The percentage of "None" values is:', (int(sentiment_number/len(df) * 1000)/10), '%')

    i = 0
    sentiment_number = 0

    while i < len(df):
        if df.loc[i,('sentiment')] == 'Bullish':
            sentiment_number += 1
        i += 1

    print('The number of "Bullish" stocktwits sentiment values is:', sentiment_number)
    print('The percentage of "Bullish" values is:', (int(sentiment_number/len(df) * 1000)/10), '%')
            
    i = 0
    sentiment_number = 0

    while i < len(df):
        if df.loc[i,('sentiment')] == 'Bearish':
            sentiment_number += 1
        i += 1

    print('The number of "Bearish" stocktwits sentiment values is:', sentiment_number)
    print('The percentage of "Bearish" values is:', (int(sentiment_number/len(df) * 1000)/10), '%')
            
# 120 Allows user to manually input value when stocktwits sentiment value is "None"
# It counts every 20 edits and gives the user the option to quit. If the user chooses to quit
# it breaks from the while look and writes the df to a csv file so all work is saved up to that point.
# upon start up it ask if thie is the first time processing the raw data. If no it loads the csv file into
# the dataframe and starts where the previous session left off. If "modified?" is "Yes and "sentiment" is "None"
# it skips the record. Therefore it will re-start at the first "modified?" is "No" and "sentiment" is "None"
def edit(df):

    import copy
        
    i = 0
    counter = 0    # counter to see if user want to stop

    while i < len(df):
    #while i < 6:

        if df.loc[i,('sentiment')] == 'None' and df.loc[i,('modified?')] == 'No': # Column 9 is 'modified?'
            print('\nindex number:', i, '\n', df.loc[i, ('body')])
            #print('This is the body of the tweet:\n', df..log[i,('body')])
            rating = int(input('Enter your rating (1, 0 or -1.):')) 
            df.loc[i,('modified_rating')] = copy.deepcopy(rating) # writes inputed number to the 'modified_rating'
            df.loc[i,('modified?')] = 'Yes' # sets "modified?" equal to 'Yes' to identify which records have been modified; so that it can start at the next record at start up
        
            counter += 1
        
        elif df.loc[i,('sentiment')] == 'Bearish':

            df.loc[i,('modified_rating')] = df.loc[i,('sentiment_number')] #copies the stocktwits 'sentiment_number' (7) to the 'modified_rating(8)
        
        elif df.loc[i,('sentiment')] == 'Bullish':
        
            df.loc[i,('modified_rating')] = df.loc[i,('sentiment_number')] #copies the stocktwits 'sentiment_number' (7) to the 'modified_rating(8)

        if counter == 20: # represents 20 edits
            quit = input('Do you want to quit? (Enter either a "y" or "Y") ')
            if quit == 'y' or quit == 'Y':
                print('You are exiting.')
                break
            else:
                counter = 0 # resets the counter to 0 so there must be another 20 records reviewed and modified 
        
        i += 1
    
    #df.to_csv(filename, index = False)
    #print('The csv file was written. File name: ', filename)
    
    return df

# 140 This will change the modified rating (8) to the nltk rating (6) only when they are opposite to see if it improves 
#the accuracy number 
def change_opp_nltk(df):
    
    filename = 'tech stockTwit 02232021 opposite compound_bin vs modified_rating.csv'
    
    print('The name of the csv file that will be written to is: ', filename)
    
    correct_name = input('Is this the correct filename? (enter "N" or "n" for no)')
          
    if correct_name == 'N' or correct_name == 'n':
          new_name = input('What is the correct name?')
          filename = new_name
    
    i = 0

    import copy

    counter = 0    # counter to see if user want to stop

    while i < len(df):

        if df.loc[i,('sentiment')] == -1 and df.loc[i,('modified_rating')] == 1:
            df.loc[i,('modified_rating')] = copy.deepcopy(df.loc[i,('sentiment')]) # change "modified_rating" (8) to "compound_bin" (6)      
        
        elif df.loc[i,('sentiment')] == 1 and df.loc[i,('modified_rating')] == -1:
            df.loc[i,('modified_rating')] = copy.deepcopy(df.loc[i,('sentiment')]) # change "modified_rating" to "compound_bin"     

        i += 1
    
    df.to_csv(filename, index = False)
    print('The csv file was written. File name: ', filename)
    
    return df

# 180 counts how many "None" sentiment values are there for the stocktwits sentiment modified rating values
def none_count(df):
    i = 0
    sentiment_number = 0

    while i < len(df):
        if df.loc[i,('modified_rating')] == 0.0: # column #8 is 'modified_rating'
            sentiment_number += 1
        i +=1
    
    print('The number of "None" stocktwits sentiment values is:', sentiment_number)
    print('The percentage of "None" values is:', (int(sentiment_number/len(df) * 1000)/10), '%')
    
#440 sets up stopword removal; returns stopWords
def set_up_nltk_stopword_removal():
    #from nltk.tokenize import sent_tokenize, word_tokenize
    from nltk.corpus import stopwords
    import nltk
    nltk.download('stopwords')
    stopWords = set(stopwords.words('english'))

    print(len(stopWords))
    return stopWords

#470 creates a list of new stopwords and then adds them to the set provided by nltk
#Note: it is case sensitive
#Input is the nltk stopword list ("stopWords")
def add_new_stopwords(sw):
    newStopWords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
    newStopWords += ['again', 'against', 'all', 'almost', 'alone', 'along']
    newStopWords += ['already', 'also', 'although', 'always', 'am', 'among']
    newStopWords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
    newStopWords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
    newStopWords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
    newStopWords += ['because', 'become', 'becomes', 'becoming', 'been']
    newStopWords += ['before', 'beforehand', 'behind', 'being', 'below']
    newStopWords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
    newStopWords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
    newStopWords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
    newStopWords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
    newStopWords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
    newStopWords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
    newStopWords += ['every', 'everyone', 'everything', 'everywhere', 'except']
    newStopWords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
    newStopWords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
    newStopWords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
    newStopWords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
    newStopWords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
    newStopWords += ['herself', 'him', 'himself', 'his', 'how', 'however']
    newStopWords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
    newStopWords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
    newStopWords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
    newStopWords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
    newStopWords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
    newStopWords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
    newStopWords += ['nevertheless', 'next', 'nine', 'nobody', 'none'] #removed 'no'
    newStopWords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
    newStopWords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
    newStopWords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
    newStopWords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
    newStopWords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
    newStopWords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
    newStopWords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
    newStopWords += ['some', 'somehow', 'someone', 'something', 'sometime']
    newStopWords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
    newStopWords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
    newStopWords += ['then', 'thence', 'there', 'thereafter', 'thereby']
    newStopWords += ['therefore', 'therein', 'thereupon', 'these', 'they']
    newStopWords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
    newStopWords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
    newStopWords += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
    newStopWords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
    newStopWords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what']
    newStopWords += ['whatever', 'when', 'whence', 'whenever', 'where']
    newStopWords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
    newStopWords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
    newStopWords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
    newStopWords += ['within', 'without', 'would', 'yet', 'you', 'your']
    newStopWords += ['yours', 'yourself', 'yourselves'] #provided by Codecademy??

    # additional stopwords:
    newStopWords += ['[Screenshot]', '[screenshot]', 'Screenshot', '[Screenshot]Great', '[SCREENSHOT]', 'screenshot', 
                 'The', 'the', 'SMART', 'yah', 'got', 'nutty', 'moving', 'weeks', 'Got', 'So', 'today', 'Been', 'or']

    newStopWords += ['I', 'You', 'you', 'He', 'he', 'She', 'she', 'They', 'they', 'Their', 'their', 'it', 'It'] # pronouns

    newStopWords += ['AMD', 'NVDA','NVDA', 'TSLA', 'GOOG', 'BA', 'FB', 'GOOGL', 'INTC', 'intel', 'Intel', 'CSCO', 'MU', 
                 'SMH', 'TSM','AAPL', 'TSLA', 'CSCO', 'POETF', 'PHOTONICS', 'DD', 'ARWR', 'T', 'INFI', 'AMC', 'ARK',
                'GME', 'NIO', 'QS', 'INTC'] # Stock symbols or names

    newStopWords += ['Readytogo123', 'Maddog68','Stocktwits', 'Big Trade'] # nouns

    newStopWords += ['.', '?', '!', ';', ',', "'", '.'] # punctuation

    newStopWords += ['&', '#', '%', '$', '@', '/'] # symbols

    newStopWords += ['41.75', '530.05', '39', 'Two', 'two', 'One', 'one', 'Three', 'three', 'Four', 'four',
                    'Five', 'five', 'Six', 'six', 'Seven', 'seven', 'Eight', 'eight', 'Nine', 'nine', 'Ten',
                    'ten', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] # numbers

    #adds them to the stopWords list provided by nltk
    for i in newStopWords:
        sw.add(i) #stopWords is defined as a "set" in #450 when inputed as english words from nltk;
        # sets cannot be ordered so it must be converted back to a list to be ordered or alphabetized. A set has no duplicate elements.

    print('The length of the stopword list is: ', len(sw))
    #print(stopWords)

    #converts the set to a list
    stopWords_list = list(sw)

    #sorts the stopword list
    stopWords_list.sort(key = lambda k : k.lower())
    #print(stopWords_list)
    
    return stopWords_list

#480 This removes words from the list of stopwords and writes list to csv file
# https://stackoverflow.com/questions/29771168/how-to-remove-words-from-a-list-in-python#:~:text=one%20more%20easy%20way%20to%20remove%20words%20from,%3D%20words%20-%20stopwords%20final_list%20%3D%20list%20%28final_list%29
#new_words = list(filter(lambda w: w not in stop_words, initial_words))
def remove_from_stopwords(sw, relevant_path):
    WordsToBeRem = ['no']
    stopWords = list(filter(lambda w: w not in WordsToBeRem, sw)) #It will retain anyword in sw that is not in WordsToBeRemoved

    #converts the stopword list to a df so that it can then be written to a csv file
    df_stopwords = pd.DataFrame(stopWords, columns = ['stopwords'])
    name_of_csv_file = relevant_path + '/' + 'stopwords.csv'
    df_stopwords.to_csv(name_of_csv_file, index = False) #writes stopwords to csv file

    #print(stopWords)
    
    return stopWords

#490 Checks to see of the words were removed from the stopWords list.
#inputs: stopword list (sw) and the word to be removed from the so (WordToBeRem):
def check_stopwords(sw, WordToBeRem):
    
    r = 0

    for w in sw:
        #print(w)
        if w == WordToBeRem:
            print('The word ', w , ' is still in the stopWords list!')
            r += 1

    if r == 0:
        print('It did remove the words from the stopWords list!')
    
    #print(len(stopWords))

#510 Removes stopwords from all the "body" text (tweets); to do this it must tokenize the string which means it must parse 
# the string into individual words. It then compares the words with the words in the stopwords list and if there is not 
# match it puts the word into the "wordsFiltered" list. It keeps appending to the list until all of the words are checked.
# It then joins the individual words back into a string.
#There is a difference between "deep" copy and "shallow" copy. "Deep" copy make a copy where the index and data are
# separate from the original. "Shallow" copy is like a pointer where the two df share a common index and data
#dfScrubbed = df #This is a shallow copy
def rem_stopwords(df, stopWords):
    
    from nltk.tokenize import sent_tokenize, word_tokenize
    
    dfScrubbed = df.copy() #This is a deep copy. df.copy(deep = True); deep = True is default

    i = 0
    while i < len(df):
    
        data = df.loc[i,('body')]
        words = word_tokenize(data) # separates the string into a individual words.
        wordsFiltered = []

        for w in words:
            if w not in stopWords:
                wordsFiltered.append(w) # makes a new word list without the stopwords
    
        joinedWordsFiltered = ' '.join(wordsFiltered)
    
        dfScrubbed.loc[i,('body')] = joinedWordsFiltered # replaces the recorded in dfScrubbed with the stopWords removed
        # from the 'body'
    
        i += 1
    
    #print(wordsFiltered)
    
    #### method removes empty body rows and reindexes
    dfScrubbed = remove_empty_body_rows(dfScrubbed)
    
    #### checks to see if there are any empty records left
    print('Are there any empty body records?')
    empty = np.where(pd.isnull(dfScrubbed['body'])) #checks to see if there are any empty records in the column 'body'
    print(empty)
    
    #print(dfScrubbed.head())
    
    return dfScrubbed

#550 converts the scrubbed_compound scores into a 1 significant figure integer from a float number; rounding up
# this is only needed if you are going to uses the 'scrubbed_compound' value as the label.
def int_conversion(dfs):
    dfs['scrubbed_compound'] =  np.int64((dfs['scrubbed_compound'] + .05) * 10)

# 550 converts the 'scrubbed_compound' (column 10) data to either a 1, 0 or -1.  
# if nltk sentiment number are >= .1; 0 if -.1 < x < .1 and -1 if <= -.1 and over-rights the value in compound_bin
# creates a new column called 'compound_bin' from the raw_compound scores
def bin_sentiment(dfs):
    dfs['scrubbed_compound_bin'] = dfs['scrubbed_compound'] # creates a new column 'scrubbed_compound_bin' (column 11)

    i = 0
    while i < len(df):
        if dfs.loc[i,('scrubbed_compound')] >= 0.1: # column 10 is 'scrubbed_compound'
            dfs.loc[i, ('scrubbed_compound_bin')] =  np.int(dfs.loc[i,('scrubbed_compound')] + .9) # column 11 is 'scurbbed_compound_bin'
        
        if dfs.loc[i,('scrubbed_compound')] < .1 and dfs.loc[i,('scrubbed_compound')] > -.1:
            dfs.iloc[i, 11] = 0   
        
        if dfs.loc[i,('scrubbed_compound')] <= -.1:
            dfs.loc[i, ('scrubbed_compound_bin')] =  np.int(dfs.loc[i,('scrubbed_compound')]  - .9)
        i += 1
    
    print(dfs)

#640 compares the first record (index = 0) raw data ("body" column) with scrubbed (stopwords removed) data
#inputs: df - original df; dfs - scrubbed df (stopwords removed)
def compare_scrubbed(df, dfs):
    print(df.loc[0,('body')])
    print(dfs.loc[0,('body')])

# 650 Loads and combines two different dataframes in df; this is to combine two input datasets where the 'none'
#values have been modified; this is to see if increased records will increase the accuracy of the model.
def combine_dfs(df1, df2):

    df = df1.append(df2)

    print('The length of file 1 is:', len(df1))
    print('The length of file 2 is:', len(df2))

    print('The length of the combined dataframe is:', len(df))
    
    return df

# 660 Writes a csv file
#input: df that is to be saved as a csv; output file name (eg 'tech stockTwit 03112021 dup advert stopwords.csv'
def write_csv(df, filename_output, relevant_path):
    
    df.to_csv(relevant_path + '/' + filename_output, index = False, encoding = 'utf-8')
    print('The csv file was written. File name: ', filename_output)
    
# displays a list of file with on a csv suffix       
def list_dir_files(relevant_path):
    # https://clay-atlas.com/us/blog/2019/10/27/python-english-tutorial-solved-unicodeescape-error-escape-syntaxerror/?doing_wp_cron=1618286551.1528689861297607421875
    #need to change \ to /

    import os
    
    included_extensions = ['csv']
    file_names = [fn for fn in os.listdir(relevant_path) # uses os.listdir to display only .csv files
              if any(fn.endswith(ext) for ext in included_extensions)]

    print('Path: ', relevant_path)

    for f in file_names:
        print(f)

# removes duplicate headers
def remove_duplicate_headers(df):
    column = 'symbol'
    df.drop(df[df['symbol'] == column].index, inplace=True)
    df = df.reset_index(drop = True) # resets the index
    return df

# removes specific rows and resets the index
def remove_empty_body_rows(df):
    df.dropna(subset=['body'], inplace=True) #drops empty body records
    df = df.reset_index(drop = True) # resets the index
    return df

#### checks to see if there are any empty records left
def empty_records_check(df):
    print('Are there any empty body records?')
    empty = np.where(pd.isnull(df['body'])) #checks to see if there are any empty records in the column 'body'
    
    if empty[0].size == 0:
        print('There are no empty records! \n', empty)
    else:
        print('There are empty records ...\n', empty)
        
#### Removes Imogis
def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

# combines both names of file wanted to combing and writes csv file
def combine_two_files():
    first_filename = input()
    
def rem_dup_adver_ever_oth_emoji(df):
    #remove duplicates
    r_d = input('Do you want to remove duplicates? [Press enter if no] ')
    if r_d in yes_resp:
        df = remove_duplicates(df) #return df; removes duplicates
        remove_dupl = 'r_d '
    else:
        remove_dupl = ''
    
    #remove advertisements
    r_a = input('Do you want to remove advertisements? [Press enter if no] ')
    if r_a in yes_resp:
        df = filter_records(df) #returns df; removes addvertisements
        remove_advertisements = 'r_a '
    else:
        remove_advertisements = ''

    # 110 OPTIONAL: This removes every other "None" record to reduce the total number of "None" rating. This is to make
    #the 'None' proportions more equal. It also prints the ratios of each sentiment response to the total number
    #of responses.
    r_e_o = input('Do you want to remove every other neutral sentiment record: [Press enter if no] ')
    if r_e_o in yes_resp:
        df = remove_every_other(df) #returns df
        rem_every_other = 'r_e_o '
    else:
        rem_every_other = ''
    
    # remove emojis
    r_emoj = input('Do you want to remove emojis from the body records: [Press enter if no] ')
    if r_emoj in yes_resp:
        #print('location1')
        i = 0
        #print('location2')
        while i < len(df):
            #print('location3', i)
            string = df.loc[i, ('body')]
            #print('location4')
            #print('original string: ', string)
            new_string = remove_emoji(string)
            #print('location5')
            #print('new string: ', new_string)
            df.loc[i, ('body')] = new_string
            #print(df['body'][i])
        
            r_emoji = 'r_emoj '

            i += 1
    else:
        r_emoji = ''
        
    return df, r_emoji, rem_every_other, remove_advertisements, remove_dupl
    
# stopword removal from tweets
def remove_stopwords(df):
    s_w_r = input('Do you want to remove the Stopwords? [Press enter if no] ')
    if s_w_r in yes_resp:
    
        swords = 'r_stopwords ' # this is for the filename for the csv file
    
        #440 imports the nltk stopword list that holds the stopwords that will be removed from the text ('body.') 
        sw = set_up_nltk_stopword_removal() 

        #470 creates a list of new stopwords and then adds them to the set provided by nltk
        #Note  it is case sensitive
        #Input is the nltk stopword list ("stopWords")
        new_sw = add_new_stopwords(sw) 

        #480 This removes words from the list of stopwords and writes list to csv file
        # https //stackoverflow.com/questions/29771168/how-to-remove-words-from-a-list-in-python# ~ text=one%20more%20easy%20way%20to%20remove%20words%20from,%3D%20words%20-%20stopwords%20final_list%20%3D%20list%20%28final_list%29
        #new_words = list(filter(lambda w  w not in stop_words, initial_words))
        final_sw = remove_from_stopwords(new_sw, relevant_path) 
        #return stopWords

        #490 Checks to see of the words were removed from the stopWords list.
        #inputs  stopword list (sw) which is the output from remove_from_stopwords(sw); the word to be removed "no" 
        check_stopwords(final_sw, 'no') 

        #510 Removes stopwords from all the "body" text (tweets); to do this it must tokenize the string which means it must parse 
        # the string into individual words. It then compares the words with the words in the stopwords list and if there is not 
        # match it puts the word into the "wordsFiltered" list. It keeps appending to the list until all of the words are checked.
        # It then joins the individual words back into a string.
        #There is a difference between "deep" copy and "shallow" copy. "Deep" copy make a copy where the index and data are
        # separate from the original. "Shallow" copy is like a pointer where the two df share a common index and data
        #dfAPIScrubbed = dfAPI #This is a shallow copy
    
        dfs = rem_stopwords(df, final_sw) #removes the stopwords and empty body records and returns dfScrubbed

        #550 converts the scrubbed_compound scores into a 1 significant figure integer from a float number; rounding up
        # this is only needed if you are going to uses the 'scrubbed_compound' value as the label.
        #int_conversion(dfScrubbed) #return df

        # compares the first record (index = 0) raw data ("body" column) with scrubbed (stopwords removed) data
        #inputs  df - original df; dfs - scrubbed df (stopwords removed)
        c_o_w_s_r = input('Do you want to compare the original tweet with the stopwords removed tweet? ')
        if c_o_w_s_r in yes_resp:
            compare_scrubbed(df, dfs) 
    else:
        dfs = df
        swords = ''
        
    return dfs, swords

def vader_analysis(df): #performs Vader sentiment analysis and adds to df the compound binning and converts the stocktwits string value to a numerical value.
    df = vader_sentiment(df) #returns df; adds column with Vader sentiment values ('raw_compound') from the 'body' column.
    print('Produced Vader sentiment values.')

    df = compound_binning(df) #returns df; adds a column where the raw_compound scores are translated into 1, 0 or -1 'compound_bin'
    print('Completed the Vader compound binning.')

    df = convert_sentiment_to_numerical(df) #returns df
    print('Converted the Stocktwits sentiments to a numberical value (1,0,-1).')
    print('\nAll finished with the Vader sentiment analysis.\n')
        
    return df


MAIN

In [19]:

yes_resp = ['yes', 'YES', 'y', 'Y', 'Yes']
no_resp = ['no', 'NO', 'n', 'N', 'No']


relevant_path = 'C:/Users/pstri/OneDrive/Documents/Personal/Kokoro/NLTK/Code Project/Scraped Files'

print('Here is a list of the csv files to choose from: \n')
list_dir_files(relevant_path) # gives all of the file options in the relevant path.

time.sleep(2)

name = input('\nWhat file do you want to use? \n')
df = getData(relevant_path + '/' + name) #returns df; reads csv file into df
print('Imported the csv file.')

df = remove_duplicate_headers(df)

if 'raw_compound' not in df.columns:
    before_scrubbing = input('Do you want to run the Vader analysis before scrubbing? \n')
    if before_scrubbing in yes_resp:
        vader_run = 'v_b '
        print('\nThis is the first time this file has been preprocessed.\n')
        print('Performing Vader sentiment analysis before scrubbing... \n')
    
        df = vader_analysis(df)
        
        df, r_emoji, rem_every_other, remove_advertisements, remove_dupl = rem_dup_adver_ever_oth_emoji(df)
        df, swords = remove_stopwords(df)
        
    else:
        vader_run = 'v_a '
        print('\n Performing Vader sentiment analysis after scrubbing... \n')
        
        df, r_emoji, rem_every_other, remove_advertisements, remove_dupl = rem_dup_adver_ever_oth_emoji(df)
        df, swords = remove_stopwords(df)
    
        df = vader_analysis(df)   
else:
    print('\nThis file has been preprocessed before. There is no need to run the VADER analysis.\n')


# 90 OPTIONAL Compares the Vader sentiment numbers with the Stocktwits sentiment ratings.
v_c = input('Do you want to compare the Vader sentiment numbers with the Stocktwits sentiment ratings? [Press enter if no] ')
if v_c in yes_resp:
        if 'raw_compound' in df.columns: #checks to see if this file have been prepocessed before by seeing if the column 'raw_compond' exists
            vader_correct(df) 

# 100 OPTIONAL: Counts how many "None" sentiment values are there for the stocktwits sentiment value
c_n_s = input('Do you want to count the "None" sentiment values for the Stocktwits sentiments before any edits? [Press enter if no] ')
if c_n_s in yes_resp:
    none_count_raw(df) 

# 115 OPTIONAL: Provides statistics on Stocktwits sentiments; bullish, none or bearish.
s_o_s = input('Do you want to see the statistics on the Stocktwits sentiments? [Press enter if no] ')
if s_o_s in yes_resp:
    stats(df) 

# 120 OPTIONAL: Allows user to manually input value when stocktwits sentiment value is "None"
# It counts every 20 edits and gives the user the option to quit. If the user chooses to quit
# it breaks from the while look and writes the df to a csv file so all work is saved up to that point.
# upon start up it ask if thie is the first time processing the raw data. If no it loads the csv file into
# the dataframe and starts where the previous session left off. If "modified?" is "Yes and "sentiment" is "None"
# it skips the record. Therefore it will re-start at the first "modified?" is "No" and "sentiment" is "None"

e_n = input('Do you want to edit the "None" records? [Press enter if no] ')
if e_n in yes_resp:
    df = edit(df) #returns df
    ed = 'edited '
else:
    ed = ''

# 180 OPTIONAL: counts how many "None" sentiment values are there for the stocktwits sentiment values after the edit
n_r_a_e = input('Do you want to see how many "None" records there are after the edits? [Press enter if no] ')
if n_r_a_e in yes_resp:
    none_count(df) 

# 140 OPTIONAL: This will change the modified rating to the nltk rating only when they are opposite to see if it improves 
#the accuracy number 
# flip vader rating if opposite to stocktwits sentiment
f_v_r = input('Do you want to flip the Vader sentiment rating when it is the opposite of the Stocktwits sentiment rating? [Press enter if no] ')
if f_v_r in yes_resp:
    df = change_opp_nltk(df) #returns df

# 180 OPTIONAL: counts how many "None" sentiment values are there for the stocktwits sentiment value
n_c_a_e = input('Do you want to see the number of "None" sentiments after the edit? [Press enter if no] ')
if n_c_a_e in yes_resp:
    none_count(df) 

#### checks to see if there are any empty records
print('Test empty records before writing the csv file')
empty_records_check(df)

df = remove_empty_body_rows(df)

# Writes a csv file; input  df that is to be saved as a csv; output file name is combination of types of editing
w_csv = input('Do you want to write a csv file? [Press enter if no] ')
if w_csv in yes_resp:
    processed = 'preprocessed '
    # creates a file name that is a combination of all the different scrubbing types
    filename_output = processed + remove_dupl + remove_advertisements + rem_every_other + swords + ed + r_emoji + vader_run + name
    
    if name == filename_output: #Checks to see if the file already exists
        os.remove(filename_output) #If the file already exists it deletes the original file
        print('The old file was deleted.\n')
    
    write_csv(df, filename_output, relevant_path) #Writes the df to a new file
    print('The file was written with the filename of: ', filename_output, '\n')

    # NOTE TO SELF - When there is a record that has spaces only, it is encoded as a 'NaN' or empty record
    #when encoded as a utf-8 csv file. It will cause the postprocessing Vader app to crash. Importing the csv file
    #and then removing the 'NaN' and then rewriting the csv file should take care of the problem.

    final_name =  relevant_path + '/' + filename_output
    print('The filename is: \n', final_name)
    dftest = getData(final_name)
    print('csv file read into df to see if all of the empty records are removed.')
    empty_records_check(dftest)
    df_final = remove_empty_body_rows(dftest)
    empty_records_check(df_final)

    os.remove(final_name) #If the file already exists it deletes the original file
    write_csv(df_final, filename_output, relevant_path) #Writes the df to a new file

# combines two dfs
c_t_dfs = input('Do you want to combine two files? [Press enter if no] ')
if c_t_dfs in yes_resp:
    
    print('Here is a list of the csv files to choose from: \n')
    list_dir_files(relevant_path)
    first_name = input('\nWhat is the first file you want to combine? ')
    df = getData(relevant_path + '/' + first_name) #returns df; reads csv file into df
    print('Imported the csv file.')
    
    second_name = input('What is the second file you want to add? ')
    df2 = getData(relevant_path + '/' + second_name)

    # 650 Loads and combines two different dataframes in dfAPI; this is to combine two input datasets where the 'none'
    #values have been modified; this is to see if increased records will increase the accuracy of the model.
    df = combine_dfs(df1, df2)
    
    w_csv = input('Do you want to write a csv file? [Press enter if no] ')
    if w_csv in yes_resp:
        first_name_no_csv = first_name.replace('.csv', ' + ') 
        duo_name = first_name_no_csv + second
        write_csv(df, duo_name, relevant_path) #Writes the df to a new file
        print('The file was written with the filename of: ', duo_name, '\n')


print('\nAll done ....')

Here is a list of the csv files to choose from: 

Path:  C:/Users/pstri/OneDrive/Documents/Personal/Kokoro/NLTK/Code Project/Scraped Files
2021-04-10 ARKG search stocktwits.csv
2021-04-11 ROST search stocktwits.csv
2021-04-11 TSLA search stocktwits.csv
2021-04-11 V search stocktwits.csv
2021-04-12 ACAD search stocktwits.csv
2021-04-12 EBAY search stocktwits.csv
2021-04-12 FB search stocktwits.csv
2021-04-12 INTC search stocktwits.csv
2021-04-12 OSTK search stocktwits.csv
2021-04-12 V search stocktwits.csv
2021-04-12 WKHS search stocktwits.csv
ARKG search stocktwits-Copy1.csv
preprocessed edited tech stockTwit 03112021.csv
preprocessed r_a tech stockTwit 03112021.csv
preprocessed r_d r_a r_e_o tech stockTwit 03112021.csv
preprocessed r_d r_a r_stopwords r_emoj tech stockTwit 03112021.csv
preprocessed r_d r_a r_stopwords tech stockTwit 03112021.csv
preprocessed r_d r_a tech stockTwit 03112021.csv
preprocessed r_d tech stockTwit 03112021.csv
preprocessed r_emoj tech stockTwit 03112021.csv

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pstri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


179
The length of the stopword list is:  469
It did remove the words from the stopWords list!
Are there any empty body records?
(array([], dtype=int64),)
Do you want to compare the original tweet with the stopwords removed tweet? n
The number of clean records in the df are:  1616 

Produced Vader sentiment values.
Completed the Vader compound binning.
Converted the Stocktwits sentiments to a numberical value (1,0,-1).

All finished with the Vader sentiment analysis.

Do you want to compare the Vader sentiment numbers with the Stocktwits sentiment ratings? [Press enter if no] n
Do you want to count the "None" sentiment values for the Stocktwits sentiments before any edits? [Press enter if no] 
Do you want to see the statistics on the Stocktwits sentiments? [Press enter if no] 
Do you want to edit the "None" records? [Press enter if no] 
Do you want to see how many "None" records there are after the edits? [Press enter if no] 
Do you want to flip the Vader sentiment rating when it is the

In [12]:
print(df.columns)

Index(['symbol', 'created_at', 'body', 'followers', 'sentiment',
       'raw_compound', 'compound_bin', 'sentiment_number', 'modified_rating',
       'modified?'],
      dtype='object')


In [9]:
dftest = getData('preprocessed tech stockTwit 03112021.csv')
print(dftest.head())

  symbol            created_at  \
0   INTC  2021-03-05T21:01:03Z   
1   INTC  2021-03-05T21:01:03Z   
2   INTC  2021-03-05T21:00:02Z   
3   INTC  2021-03-05T20:51:14Z   
4   INTC  2021-03-05T20:06:56Z   

                                                body followers sentiment  \
0  $INTC Big Trade - $16 399 800.270 000 shares a...       862      None   
1  Large Print $INTC Size: 270000 Price: 60.74 Ti...      5502      None   
2  Huge Print $INTC Size: 4033477 Price: 60.74 Ti...      5502      None   
3               $AMD common follow ur sibs $INTC $MU        48   Bullish   
4                $ITT $INTC $ADBE $OPTT $GLBS  .  .        575   Bullish   

   raw_compound  compound_bin  sentiment_number  modified_rating modified?  
0        0.2960           1.0               0.0                0        No  
1        0.0000           0.0               0.0                0        No  
2        0.3182           1.0               0.0                0        No  
3        0.0000           0.0 

In [21]:
#df = df.reset_index(drop = True)
print(df)

     symbol            created_at  \
0      INTC  2021-03-05T20:51:14Z   
1      INTC  2021-03-05T20:06:56Z   
2      INTC  2021-03-05T19:57:20Z   
3      INTC  2021-03-05T19:52:43Z   
4      INTC  2021-03-05T19:36:13Z   
...     ...                   ...   
1286     MU  2021-02-24T13:22:35Z   
1287     MU  2021-02-24T12:48:31Z   
1288     MU  2021-02-24T12:38:21Z   
1289     MU  2021-02-24T12:10:44Z   
1290     MU  2021-02-24T12:10:09Z   

                                                   body followers sentiment  \
0                  $AMD common follow ur sibs $INTC $MU        48   Bullish   
1                   $ITT $INTC $ADBE $OPTT $GLBS  .  .        575   Bullish   
2     $INTC Should be thankful we are in this bull m...        21   Bullish   
3     @ButterFingerDROPs $INTC had its sell off back...        77      None   
4     $AMD At this rate  this will be left behind by...        11      None   
...                                                 ...       ...       ...   
12

In [10]:
# 100 counts how many "None" sentiment values are there for the stocktwits sentiment value
none_count_raw(df) 


The number of "None" stocktwits sentiment values is: 481
The percentage of "None" values is: 37.2 %


In [5]:
yes_resp = ['yes', 'YES', 'y', 'Y', 'Yes']
no_resp = ['no', 'NO', 'n', 'N', 'No']

test = input('do you want to test? ')
if test in yes_resp:
    print('yes I do')

do you want to test? Y
yes I do


In [11]:
name1 = 'output.csv'
remove_dupl = 'a '
remove_advertisements = 'b '
remove_every_other = ''
ed = 'd '

filename_output = remove_dupl + remove_advertisements + remove_every_other + ed + name1

print(filename_output)

a b d output.csv


In [3]:
#how to determine if column exists
import pandas as pd
 
df = pd.DataFrame([[10, 20, 30, 40], [7, 14, 21, 28], [55, 15, 8, 12]],
                  columns=['Apple', 'Orange', 'Banana', 'Pear'],
                  index=['Basket1', 'Basket2', 'Basket3'])
 
if 'apple' not in df.columns:
    print("in - no")
else:
    print("notin - yes")
 
 
if set(['Apple','Orange']).issubset(df.columns):
    print("Yes")
else:
    print("No")

in - no
Yes


In [68]:

def remove_duplicate_headers(df):
    column = 'symbol'
    df.drop(df[df['symbol'] == column].index, inplace=True)
    
    return df

print('Here is a list of the csv files to choose from: \n')
list_dir_files()
name = input('\nWhat file do you want to use? ')
df = getData(name) #returns df; reads csv file into df
print('Imported the csv file.')

print(df.head())

i = 0
while i < len(df):
    if df.iloc[i , 0] == "symbol":
        print('The index is: ', i)
    i += 1

print('starting to remove headers')
df = remove_duplicate_headers(df)
print('done removing headers')


i = 0
while i < len(df):
    if df.iloc[i , 0] == "symbol":
        print('The index is: ', i)
    i += 1
else:
    print('They are all gone!')
    
print(df.head())


Here is a list of the csv files to choose from: 

Path:  C:/Users/pstri/OneDrive/Documents/Personal/Kokoro/NLTK/Code Project/Preprocessing
preprocessed tech stockTwit 03112021.csv
stopwords.csv
tech stockTwit 03112021-Copy1 ORIGINAL DO NOT USE.csv
tech stockTwit 03112021.csv

What file do you want to use? tech stockTwit 03112021.csv
Imported the csv file.
  symbol            created_at  \
0   INTC  2021-03-05T21:01:03Z   
1   INTC  2021-03-05T21:01:03Z   
2   INTC  2021-03-05T21:00:02Z   
3   INTC  2021-03-05T20:51:14Z   
4   INTC  2021-03-05T20:06:56Z   

                                                body followers sentiment  
0  $INTC Big Trade - $16 399 800.270 000 shares a...       862      None  
1  Large Print $INTC Size: 270000 Price: 60.74 Ti...      5502      None  
2  Huge Print $INTC Size: 4033477 Price: 60.74 Ti...      5502      None  
3               $AMD common follow ur sibs $INTC $MU        48   Bullish  
4                $ITT $INTC $ADBE $OPTT $GLBS  .  .        575

In [12]:
import os

def remove_duplicate_headers(df):
    column = 'symbol'
    df.drop(df[df['symbol'] == column].index, inplace=True)
    
    return df

relevant_path = 'C:/Users/pstri/OneDrive/Documents/Personal/Kokoro/NLTK/Code Project/Post Processing'
included_extensions = ['csv']
file_names = [fn for fn in os.listdir(relevant_path)
              if any(fn.endswith(ext) for ext in included_extensions)]

for f in file_names:
    print(f)
    
name = input('What file do you want: ')
df = getData(relevant_path + '/' + name)

print(df.head(120))

print('before:')
empty = np.where(pd.isnull(df['body']))
print('empty')

df = remove_duplicate_headers(df)

df = df.fillna(value ={'body':' '}) #replaces any empty 'body' records with a space

print('after:')
np.where(pd.isnull(df['body']))

print(df.head(120))



What file do you want: preprocessed r_stopwords r_emoj tech stockTwit 03112021.csv


FileNotFoundError: [Errno 2] File C:/Users/pstri/OneDrive/Documents/Personal/Kokoro/NLTK/Code Project/Post Processing/preprocessed r_stopwords r_emoj tech stockTwit 03112021.csv does not exist: 'C:/Users/pstri/OneDrive/Documents/Personal/Kokoro/NLTK/Code Project/Post Processing/preprocessed r_stopwords r_emoj tech stockTwit 03112021.csv'

In [11]:
# finding and removing empty records in a df
dftest = getData(relevant_path + '/' + filename_output)

print(relevant_path + '/' + filename_output)
print('csv file read into df to see if all of the empty records are removed.')

#finds empty records
empty = np.where(pd.isnull(dftest['body'])) #checks to see if there are any empty records in the column 'body'; empty is a tuple where the first element is the array, the second is dtype of the array
if empty[0].size == 0:
    print('There are no empty records: \n', empty)
else:
    print('There are empty records: \n', empty, '\n')
    
print(dftest.iloc[110:125,])

#drops empty records
dftest.dropna(subset=['body'], inplace=True) #drops empty body records
dftest = dftest.reset_index(drop = True) # resets the index

empty = np.where(pd.isnull(dftest['body'])) #checks to see if there are any empty records in the column 'body'; empty is a tuple where the first element is the array, the second is dtype of the array
print('\nAFTER DROP: \n', empty, '\n')

print(dftest.iloc[110:125,])

# removes specific rows and resets the index
def remove_empty_body_rows(df):
    df.dropna(subset=['body'], inplace=True) #drops empty body records
    df = df.reset_index(drop = True) # resets the index
    return df




C:/Users/pstri/OneDrive/Documents/Personal/Kokoro/NLTK/Code Project/Scraped Files/preprocessed r_stopwords r_emoj tech stockTwit 03112021.csv
csv file read into df to see if all of the empty records are removed.
There are no empty records: 
 (array([], dtype=int64),)
    symbol            created_at  \
110   INTC  2021-03-03T13:38:58Z   
111   INTC  2021-03-03T13:28:48Z   
112   INTC  2021-03-03T13:06:30Z   
113   INTC  2021-03-03T12:33:43Z   
114   INTC  2021-03-03T12:26:41Z   
115   INTC  2021-03-03T12:26:22Z   
116   INTC  2021-03-03T11:59:19Z   
117   INTC  2021-03-03T11:53:16Z   
118   INTC  2021-03-03T11:41:57Z   
119   INTC  2021-03-02T21:46:58Z   
120   INTC  2021-03-02T21:43:00Z   
121   INTC  2021-03-02T21:35:49Z   
122   INTC  2021-03-02T21:32:28Z   
123   INTC  2021-03-02T21:21:17Z   
124   INTC  2021-03-02T21:07:09Z   

                                                  body  followers sentiment  \
110  To Pay 2.18B Penalty To VLSI Tech For Patent I...       1819      None 

In [27]:
print(df['body'][10:20])

10    $INTC  didnt even flinch on  this market sell ...
11    $AMD $INTC $NVDA repost..https://m.hexus.net/t...
12    #russell1000  Earnings Ratings #mega.stocks: $...
13     $INTC this is a real  we just don’t tell others 
14    $INTC $intc i hope that they will be bankrupte...
15    $AMD I can&#39;t believe $MU and $INTC are hav...
16    $INTC keeping the portfolio respectable today ...
17    $VZ and $INTC LEAPS I added on dip are saving ...
18    LiquidTheta® Trade Alert (Delayed/Actionable)....
19    @TraderLeibniz @Stock__Twists @Jamz83 @Uncle_C...
Name: body, dtype: object


In [45]:
#removes emojis 

# Inports the csv file of choice
relevant_path = 'C:/Users/pstri/OneDrive/Documents/Personal/Kokoro/NLTK/Code Project/Scraped Files'

print('Here is a list of the csv files to choose from: \n')
list_dir_files(relevant_path)
name = input('\nWhat file do you want to use? ')
df = getData(relevant_path + '/' + name) #returns df; reads csv file into df
print('Imported the csv file.')



def remove_emoji(string):
    import re
    import sys
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

i = 0

yes_dec = ['yes', 'y']

decision = input('decide: ')

if decision in yes_dec:
    i = 0
    while i < len(df):
        string = df.loc[i, ('body')]
        #print('original string: ', string)
        new_string = remove_emoji(string)
        #print('new string: ', new_string)
        df.loc[i, ('body')] = new_string
        #print(df['body'][i])

        i += 1
        
print('all done')

print(df.loc[13,'body'])


Here is a list of the csv files to choose from: 

Path:  C:/Users/pstri/OneDrive/Documents/Personal/Kokoro/NLTK/Code Project/Scraped Files
2021-04-10 ARKG search stocktwits.csv
2021-04-11 ROST search stocktwits.csv
2021-04-11 TSLA search stocktwits.csv
2021-04-11 V search stocktwits.csv
2021-04-12 ACAD search stocktwits.csv
2021-04-12 EBAY search stocktwits.csv
2021-04-12 FB search stocktwits.csv
2021-04-12 INTC search stocktwits.csv
2021-04-12 OSTK search stocktwits.csv
2021-04-12 V search stocktwits.csv
2021-04-12 WKHS search stocktwits.csv
ARKG search stocktwits-Copy1.csv
preprocessed edited tech stockTwit 03112021.csv
preprocessed r_a tech stockTwit 03112021.csv
preprocessed r_d r_a r_e_o tech stockTwit 03112021.csv
preprocessed r_d r_a r_stopwords tech stockTwit 03112021.csv
preprocessed r_d r_a tech stockTwit 03112021.csv
preprocessed r_d tech stockTwit 03112021.csv
preprocessed r_e_o tech stockTwit 03112021.csv
preprocessed r_stopwords tech stockTwit 03112021.csv
preprocessed te

In [35]:
print(df.loc[13,'body'])

$INTC this is a real 💎 we just don’t tell others 🤫


In [28]:
i = 0

while i < 2:
    #string = df.iloc[i,2]
    string = df.loc[i, ('body')]
    
    dat = df.loc[i, ('body')] 
    data = [dat] # 
    
    print('original string: ', dat)
    print(data)
    i += 1
    
    

original string:  $INTC Big Trade - $16 399 800.270 000 shares at $60.74
['$INTC Big Trade - $16 399 800.270 000 shares at $60.74']
original string:  Large Print $INTC Size: 270000 Price: 60.74 Time: 1601 Amount: $16 399 800.00
['Large Print $INTC Size: 270000 Price: 60.74 Time: 1601 Amount: $16 399 800.00']


In [7]:
#manipulating two names and then adding them together

first = 'first.csv'
second = 'second.csv'
first_no_csv = first.replace('.csv', ' + ') 

first_second = first_no_csv + second
print(first_second)

first + second.csv


In [6]:
negative = -1.9
rounding = int(negative)
print(rounding)

-1


In [33]:
# 40 finds certain words in the strings ('body') and deletes the entire record.
#Note: When the record is deleted the df is re-indexed. The index for the while statement is not so the result is
#that the record right after the deleted record is skipped. To remedy the problem the index (i) for the while statement 
#is decremented by one.
#Also, the filtering terms are not case sensitive.
def filter_records(df):
    import fnmatch

    data = []
    counter = 0
    advert = ['* sec *', '* daily News *', '*Huge Print*', '* Form *', '*SweepCast*', '*Large Print*', 
          '*Huge Print*', '*8-K*', '*SmartOptions*', '*Big Trade*', '*SEC Form*', '*Notice of Exempt*', 
          '*created_at*', '*stock news*', '*Trading Zones*', '*Entry:*', '*New Article*', '*ooc.bz*', 
          '*http*', 'Huge Trade', 'Trading is easy', 'www.', '#wallstreetbets', 'wallstreetbets',
          'Huge Trade', '#unitedtraders', 'stockbeep.com', 'Big Trade'] # words or phrases whose records are to be removed; It is not case sensitive.

    for a in advert:
        i = 0
        df = df.reset_index(drop = True) # resets the index before each iteration; removes the gaps; resets len(df)
        while i < len(df):
            dat = df.loc[i, ('body')] # 2 represents the 'body' column
            data = [dat] # sets the string from the df into a list for the fnmatch.filter
            #print('index = ', i)
            filtered = fnmatch.filter(data, a) # compares the information in the 'body' column with the 'advert' list; it places the matched items in the 'filtered' variable.
            #https://www.geeksforgeeks.org/fnmatch-unix-filename-pattern-matching-python/

            if len(filtered) != 0: #if returns a True then record needs to be removed
                counter += 1
            
                df = df.drop(df.index[i]) # drops (deletes) the record
            
                #print('after the record is dropped:', df..log[i,('body')], 'i = ', i)
                
                #Note: When the record is dropped there is a change in the 'index' number. after the drop index number
                #5 becomes index number 4. Since the counter increments one more time it skips the record right after
                #the record that was just checked. That is why it takes multiple runs to remove all of the target
                #records. To correct this decrement the index, i, by
                
                i -= 1
    
            i += 1

    df = df.reset_index(drop = True) # resets the index; removes the gaps   
    len(df)
    return df

df = filter_records(df)


KeyError: 226

In [70]:
# 40 finds certain words in the strings ('body') and deletes the entire record.
#Note: When the record is deleted the df is re-indexed. The index for the while statement is not so the result is
#that the record right after the deleted record is skipped. To remedy the problem the index (i) for the while statement 
#is decremented by one.
#Also, the filtering terms are not case sensitive.

import fnmatch
df = df.reset_index(drop = True) # resets the index; removes the gaps  
data = []
counter = 0
advert = ['* sec *', '* daily News *', '*Huge Print*', '* Form *', '*SweepCast*', '*Large Print*', 
          '*Huge Print*', '*8-K*', '*SmartOptions*', '*Big Trade*', '*SEC Form*', '*Notice of Exempt*', 
          '*created_at*', '*stock news*', '*Trading Zones*', '*Entry:*', '*New Article*', '*ooc.bz*', 
          '*http*', 'Huge Trade', 'Trading is easy', 'www.', '#wallstreetbets', 'wallstreetbets',
          'Huge Trade', '#unitedtraders', 'stockbeep.com', 'Big Trade'] # words or phrases whose records are to be removed; It is not case sensitive.

for a in advert:
    i = 0
    df = df.reset_index(drop = True) # resets the index before each iteration; removes the gaps; resets len(df)
    while i < len(df):
        dat = df.loc[i, ('body')] # 2 represents the 'body' column
        #print('index =', i)
        #print(dat)
        #print(a)
        data = [dat] # sets the string from the df into a list for the fnmatch.filter
        #print('index = ', i)
        filtered = fnmatch.filter(data, a) # compares the information in the 'body' column with the 'advert' list; it places the matched items in the 'filtered' variable.
        #https://www.geeksforgeeks.org/fnmatch-unix-filename-pattern-matching-python/

        if len(filtered) != 0: #if returns a True then record needs to be removed
            counter += 1
            
            df = df.drop(df.index[i]) # drops (deletes) the record
            df = df.reset_index(drop = True) # resets the index; removes the gaps   
            #print('after the record is dropped:', df..log[i,('body')], 'i = ', i)
                
            #Note: When the record is dropped there is a change in the 'index' number. after the drop index number
            #5 becomes index number 4. Since the counter increments one more time it skips the record right after
            #the record that was just checked. That is why it takes multiple runs to remove all of the target
            #records. To correct this decrement the index, i, by
                
            i -= 1
   
        i += 1

df = df.reset_index(drop = True) # resets the index; removes the gaps   
len(df)



KeyError: 'body'

In [43]:
print(df.loc[340:350,:])

    symbol            created_at  \
340   INTC  2021-02-24T13:04:58Z   
341   INTC  2021-02-24T12:18:37Z   
342   INTC  2021-02-24T10:41:42Z   
343   INTC  2021-02-24T07:23:37Z   
344   INTC  2021-02-24T05:25:16Z   
346   INTC  2021-02-24T03:39:20Z   
347   INTC  2021-02-24T03:16:32Z   
348   INTC  2021-02-24T01:56:11Z   
349   INTC  2021-02-24T01:50:40Z   
350   INTC  2021-02-24T01:46:12Z   

                                                  body followers sentiment  \
340  $INTC AMD processors having failure rate and s...        55   Bullish   
341  $INTC https://wccftech.com/intel-rocket-lake-c...        55   Bullish   
342  $ES_F $QQQ $UVXY $TWTR $INTC ..According to Go...      4272      None   
343  $SNAP $PINS $INTC I find it interesting when p...         5   Bullish   
344                                           $INTC :)         1      None   
346  $INTC So what are they gonna do with chip fact...         6      None   
347  $INTC  Trading is easy with Buy and Short sign...   

In [71]:
# 110 This removes every other "None" record to reduce the total number of "None" rating. This is to make
#the 'None' proportions more equal. It also prints the ratios of each sentiment response to the total number
#of responses.
def remove_every_other(df):
    i = 0
    counter = 0
    df = df.reset_index(drop = True) #resets the index to be continuous 

    while i < len(df):
        print('index =', i, i % 2, df.loc[i,('sentiment')])

        if df.loc[i,('sentiment')] == 'None': #column 4 is sentiment
            if i % 2 == 0: #identifies every even index where the sentiment is "None"
                df.drop(df.index[i]) #drops (deletes) the record
                print('index =', i, df.loc[i,('sentiment')])
            
        i += 1
    
    df = df.reset_index(drop = True) #resets the index to be continuous 

    i = 0
    sentiment_number = 0

    while i < len(df):
        if df.loc[i,('sentiment')] == 'None':
            sentiment_number += 1
        i += 1

    print('\nThe total number of records is: ', len(df))
    print('The number of "None" stocktwits sentiment values is:', sentiment_number)
    print('The percentage of "None" values is:', (int(sentiment_number/len(df) * 1000)/10), '%')

    i = 0
    sentiment_number = 0

    while i < len(df):
        if df.loc[i,('sentiment')] == 'Bullish':
            sentiment_number += 1
        i += 1

    print('The number of "Bullish" stocktwits sentiment values is:', sentiment_number)
    print('The percentage of "Bullish" values is:', (int(sentiment_number/len(df) * 1000)/10), '%')
            
    i = 0
    sentiment_number = 0

    while i < len(df):
        if df.loc[i,('sentiment')] == 'Bearish':
            sentiment_number += 1
        i += 1

    print('The number of "Bearish" stocktwits sentiment values is:', sentiment_number)
    print('The percentage of "Bearish" values is:', (int(sentiment_number/len(df) * 1000)/10), '% \n')
            
    return df    

remove_every_other(df)

KeyError: 'sentiment'

In [52]:
print(df.loc[0:10, :])

   symbol            created_at  \
0    INTC  2021-03-05T20:51:14Z   
1    INTC  2021-03-05T20:06:56Z   
2    INTC  2021-03-05T19:57:20Z   
3    INTC  2021-03-05T19:52:43Z   
4    INTC  2021-03-05T19:44:47Z   
5    INTC  2021-03-05T19:36:13Z   
6    INTC  2021-03-05T19:27:49Z   
7    INTC  2021-03-05T19:14:58Z   
8    INTC  2021-03-05T17:59:24Z   
9    INTC  2021-03-05T17:55:23Z   
10   INTC  2021-03-05T17:54:32Z   

                                                 body followers sentiment  \
0                $AMD common follow ur sibs $INTC $MU        48   Bullish   
1                 $ITT $INTC $ADBE $OPTT $GLBS  .  .        575   Bullish   
2   $INTC Should be thankful we are in this bull m...        21   Bullish   
3   @ButterFingerDROPs $INTC had its sell off back...        77      None   
4   $INTC  Trading is easy with Buy and Short sign...       162      None   
5   $AMD At this rate  this will be left behind by...        11      None   
6   I sold all my $AMD shares and moved 

In [23]:
# 110 This removes every other "None" record to reduce the total number of "None" rating. This is to make
#the 'None' proportions more equal. It also prints the ratios of each sentiment response to the total number
#of responses.
def remove_every_other(df):
    i = 1
    counter = 0
    df = df.reset_index(drop = True) #resets the index to be continuous 
    
    print(len(df))

    while i < len(df):
        print('index =', i, i % 2, df.loc[i,('sentiment')])

        if df.loc[i,('sentiment')] == 'None': #column 4 is sentiment
            if i % 2 == 0: #identifies every even index where the sentiment is "None"
                print('inside :',i, i % 2)
                print(df.loc[i], '\n right before drop')
                df = df.drop(df.index[i]) #drops (deletes) the record
                df = df.reset_index(drop = True) #resets the index to be continuous 

                #df.drop([i]) #drops (deletes) the record
                print('index =', i, df.loc[i,('sentiment')])
            
        i += 1
    
    df = df.reset_index(drop = True) #resets the index to be continuous 
    
    print(len(df))
    
    return df    

remove_every_other(df)

2416
index = 1 1 None
index = 2 0 Bullish
index = 3 1 Bullish
index = 4 0 Bullish
index = 5 1 None
index = 6 0 None
inside : 6 0
symbol                                                           INTC
created_at                                       2021-03-05T19:44:47Z
body                $INTC  Trading is easy with Buy and Short sign...
followers                                                         162
sentiment                                                        None
raw_compound                                                   0.4404
compound_bin                                                        1
sentiment_number                                                    0
modified_rating                                                     0
modified?                                                          No
Name: 6, dtype: object 
 right before drop
index = 6 None
index = 7 1 Bullish
index = 8 0 None
inside : 8 0
symbol                                                         

index = 139 1 Bullish
index = 140 0 None
inside : 140 0
symbol                                                           INTC
created_at                                       2021-03-01T17:54:42Z
body                Intel shares are trading higher in sympathy wi...
followers                                                        1825
sentiment                                                        None
raw_compound                                                   0.6705
compound_bin                                                        1
sentiment_number                                                    0
modified_rating                                                     0
modified?                                                          No
Name: 140, dtype: object 
 right before drop
index = 140 None
index = 141 1 Bullish
index = 142 0 Bullish
index = 143 1 None
index = 144 0 None
inside : 144 0
symbol                                                           INTC
created_at     

index = 269 1 None
index = 270 0 Bullish
index = 271 1 None
index = 272 0 Bullish
index = 273 1 None
index = 274 0 None
inside : 274 0
symbol                                                           INTC
created_at                                       2021-02-24T03:16:32Z
body                $INTC  Trading is easy with Buy and Short sign...
followers                                                         164
sentiment                                                        None
raw_compound                                                   0.4404
compound_bin                                                        1
sentiment_number                                                    0
modified_rating                                                     0
modified?                                                          No
Name: 274, dtype: object 
 right before drop
index = 274 Bullish
index = 275 1 None
index = 276 0 None
inside : 276 0
symbol                                         

Name: 390, dtype: object 
 right before drop
index = 390 None
index = 391 1 None
index = 392 0 None
inside : 392 0
symbol                                                           NVDA
created_at                                       2021-03-02T02:32:00Z
body                $NVDA  $GOOG and $DHI are selected by our stoc...
followers                                                       27000
sentiment                                                        None
raw_compound                                                   0.7717
compound_bin                                                        1
sentiment_number                                                    0
modified_rating                                                     0
modified?                                                          No
Name: 392, dtype: object 
 right before drop
index = 392 None
index = 393 1 Bullish
index = 394 0 Bullish
index = 395 1 Bullish
index = 396 0 None
inside : 396 0
symbol                 

index = 529 1 Bullish
index = 530 0 None
inside : 530 0
symbol                              NVDA
created_at          2021-02-25T02:24:47Z
body                   $NVDA why the dip
followers                            534
sentiment                           None
raw_compound                           0
compound_bin                           0
sentiment_number                       0
modified_rating                        0
modified?                             No
Name: 530, dtype: object 
 right before drop
index = 530 Bullish
index = 531 1 None
index = 532 0 Bullish
index = 533 1 Bullish
index = 534 0 Bullish
index = 535 1 Bullish
index = 536 0 None
inside : 536 0
symbol                                                           NVDA
created_at                                       2021-02-25T02:12:19Z
body                @kendrickperkgoatPhD $NVDA Not a chance! If yo...
followers                                                           2
sentiment                                       

index = 654 None
index = 655 1 None
index = 656 0 None
inside : 656 0
symbol                                                           MSFT
created_at                                       2021-03-02T21:31:46Z
body                Large Print $MSFT Size: 454343 Price: 233.87 T...
followers                                                        5502
sentiment                                                        None
raw_compound                                                        0
compound_bin                                                        0
sentiment_number                                                    0
modified_rating                                                     0
modified?                                                          No
Name: 656, dtype: object 
 right before drop
index = 656 None
index = 657 1 Bearish
index = 658 0 None
inside : 658 0
symbol                                                           MSFT
created_at                                

inside : 794 0
symbol                                                    MSFT
created_at                                2021-02-25T03:02:57Z
body                @DoubleEE @nvestment $msft CONFIDENTIAL!!!
followers                                                  254
sentiment                                                 None
raw_compound                                                 0
compound_bin                                                 0
sentiment_number                                             0
modified_rating                                              0
modified?                                                   No
Name: 794, dtype: object 
 right before drop
index = 794 None
index = 795 1 None
index = 796 0 Bullish
index = 797 1 Bullish
index = 798 0 None
inside : 798 0
symbol                                                           MSFT
created_at                                       2021-02-25T02:47:58Z
body                $MSFT this past month  the monthly R

index = 944 None
index = 945 1 Bullish
index = 946 0 None
inside : 946 0
symbol                                                           TSLA
created_at                                       2021-03-02T21:54:13Z
body                $TSLA Grow Earth. It will take out all the gre...
followers                                                          37
sentiment                                                        None
raw_compound                                                  -0.6124
compound_bin                                                       -1
sentiment_number                                                    0
modified_rating                                                     0
modified?                                                          No
Name: 946, dtype: object 
 right before drop
index = 946 Bullish
index = 947 1 Bearish
index = 948 0 None
inside : 948 0
symbol                              TSLA
created_at          2021-03-02T21:51:08Z
body                $TSL

Name: 1076, dtype: object 
 right before drop
index = 1076 Bearish
index = 1077 1 Bearish
index = 1078 0 Bullish
index = 1079 1 Bearish
index = 1080 0 Bearish
index = 1081 1 None
index = 1082 0 Bearish
index = 1083 1 Bearish
index = 1084 0 Bullish
index = 1085 1 Bearish
index = 1086 0 Bullish
index = 1087 1 Bullish
index = 1088 0 Bullish
index = 1089 1 Bullish
index = 1090 0 None
inside : 1090 0
symbol                                                           TSLA
created_at                                       2021-02-25T03:00:35Z
body                $TSLA  People are spamming this board with oth...
followers                                                          98
sentiment                                                        None
raw_compound                                                  -0.4767
compound_bin                                                       -1
sentiment_number                                                    0
modified_rating                          

index = 1204 Bullish
index = 1205 1 Bullish
index = 1206 0 None
inside : 1206 0
symbol                                                           ADBE
created_at                                       2021-03-03T02:22:47Z
body                $ADBE 2 Big Trades Today - $155.40M.333 050 sh...
followers                                                         862
sentiment                                                        None
raw_compound                                                    0.296
compound_bin                                                        1
sentiment_number                                                    0
modified_rating                                                     0
modified?                                                          No
Name: 1206, dtype: object 
 right before drop
index = 1206 None
index = 1207 1 None
index = 1208 0 None
inside : 1208 0
symbol                                                           ADBE
created_at                    

index = 1309 1 None
index = 1310 0 None
inside : 1310 0
symbol                                                           ADBE
created_at                                       2021-02-24T14:50:11Z
body                09:35:06 AM MINR Automated Entry.  Long 54 sha...
followers                                                         813
sentiment                                                        None
raw_compound                                                    0.296
compound_bin                                                        1
sentiment_number                                                    0
modified_rating                                                     0
modified?                                                          No
Name: 1310, dtype: object 
 right before drop
index = 1310 None
index = 1311 1 None
index = 1312 0 Bullish
index = 1313 1 Bearish
index = 1314 0 Bearish
index = 1315 1 Bullish
index = 1316 0 None
inside : 1316 0
symbol                          

index = 1440 Bullish
index = 1441 1 Bullish
index = 1442 0 Bullish
index = 1443 1 None
index = 1444 0 Bullish
index = 1445 1 Bullish
index = 1446 0 None
inside : 1446 0
symbol                                                            TSM
created_at                                       2021-03-02T01:30:41Z
body                $TSM 4 Big Trades Today - $170.42M.1 315 956 s...
followers                                                         862
sentiment                                                        None
raw_compound                                                    0.296
compound_bin                                                        1
sentiment_number                                                    0
modified_rating                                                     0
modified?                                                          No
Name: 1446, dtype: object 
 right before drop
index = 1446 Bullish
index = 1447 1 None
index = 1448 0 None
inside : 1448 0
symbol  

index = 1579 1 Bullish
index = 1580 0 None
inside : 1580 0
symbol                                                            TSM
created_at                                       2021-02-24T14:45:15Z
body                Wed Feb 24th.Today&#39;s WORST performing sect...
followers                                                        4599
sentiment                                                        None
raw_compound                                                  -0.7034
compound_bin                                                       -1
sentiment_number                                                    0
modified_rating                                                     0
modified?                                                          No
Name: 1580, dtype: object 
 right before drop
index = 1580 None
index = 1581 1 Bearish
index = 1582 0 None
inside : 1582 0
symbol                                                            TSM
created_at                                      

index = 1694 None
index = 1695 1 None
index = 1696 0 Bullish
index = 1697 1 Bullish
index = 1698 0 None
inside : 1698 0
symbol                                                             MU
created_at                                       2021-03-02T21:20:34Z
body                @Uncle_Covid @ShantP21 @denseanddumb @Jamz83 @...
followers                                                         548
sentiment                                                        None
raw_compound                                                    0.908
compound_bin                                                        1
sentiment_number                                                    0
modified_rating                                                     0
modified?                                                          No
Name: 1698, dtype: object 
 right before drop
index = 1698 None
index = 1699 1 None
index = 1700 0 None
inside : 1700 0
symbol                                                      

index = 1806 None
index = 1807 1 None
index = 1808 0 Bullish
index = 1809 1 None
index = 1810 0 None
inside : 1810 0
symbol                                                             MU
created_at                                       2021-02-25T19:25:17Z
body                $MU needs to clear VWAP 3rd times a charm then...
followers                                                          37
sentiment                                                        None
raw_compound                                                   0.6486
compound_bin                                                        1
sentiment_number                                                    0
modified_rating                                                     0
modified?                                                          No
Name: 1810, dtype: object 
 right before drop
index = 1810 None
index = 1811 1 Bullish
index = 1812 0 None
inside : 1812 0
symbol                                                      

Unnamed: 0,symbol,created_at,body,followers,sentiment,raw_compound,compound_bin,sentiment_number,modified_rating,modified?
0,INTC,2021-03-05T21:01:03Z,Large Print $INTC Size: 270000 Price: 60.74 Ti...,5502,,0.0000,0.0,0.0,0,No
1,INTC,2021-03-05T21:00:02Z,Huge Print $INTC Size: 4033477 Price: 60.74 Ti...,5502,,0.3182,1.0,0.0,0,No
2,INTC,2021-03-05T20:51:14Z,$AMD common follow ur sibs $INTC $MU,48,Bullish,0.0000,0.0,1.0,0,No
3,INTC,2021-03-05T20:06:56Z,$ITT $INTC $ADBE $OPTT $GLBS . .,575,Bullish,0.0000,0.0,1.0,0,No
4,INTC,2021-03-05T19:57:20Z,$INTC Should be thankful we are in this bull m...,21,Bullish,0.6996,1.0,1.0,0,No
...,...,...,...,...,...,...,...,...,...,...
1871,MU,2021-02-24T12:44:23Z,$MU 90 w ay to easy，let&#39;s see 100 this week,1,,0.0000,0.0,0.0,0,No
1872,MU,2021-02-24T12:42:03Z,$AMD $NVDA $INTC $MU $QCOM..BUY STHC reverse m...,191,Bullish,0.7249,1.0,1.0,0,No
1873,MU,2021-02-24T12:38:21Z,$MU Premarket looking promising bears can get...,55,Bullish,0.4574,1.0,1.0,0,No
1874,MU,2021-02-24T12:10:44Z,$MU DXI up over 3%,55,Bullish,0.0000,0.0,1.0,0,No


In [19]:
len(df)

2416

In [6]:
i = 0
df = df.drop(df.index[i]) #drops (deletes) the record
#print('index =', i, df.loc[i,('sentiment')])

print(df.loc[0:15,:])


   symbol            created_at  \
1    INTC  2021-03-05T21:01:03Z   
2    INTC  2021-03-05T21:00:02Z   
3    INTC  2021-03-05T20:51:14Z   
4    INTC  2021-03-05T20:06:56Z   
5    INTC  2021-03-05T19:57:20Z   
6    INTC  2021-03-05T19:52:43Z   
7    INTC  2021-03-05T19:44:47Z   
8    INTC  2021-03-05T19:36:13Z   
9    INTC  2021-03-05T19:27:49Z   
10   INTC  2021-03-05T19:14:58Z   
11   INTC  2021-03-05T18:34:39Z   
12   INTC  2021-03-05T18:05:15Z   
13   INTC  2021-03-05T17:59:24Z   
14   INTC  2021-03-05T17:55:23Z   
15   INTC  2021-03-05T17:54:32Z   

                                                 body followers sentiment  \
1   Large Print $INTC Size: 270000 Price: 60.74 Ti...      5502      None   
2   Huge Print $INTC Size: 4033477 Price: 60.74 Ti...      5502      None   
3                $AMD common follow ur sibs $INTC $MU        48   Bullish   
4                 $ITT $INTC $ADBE $OPTT $GLBS  .  .        575   Bullish   
5   $INTC Should be thankful we are in this bull m... 

In [67]:
#Create a DataFrame

import pandas as pd
import numpy as np

d = { 'Name':['Alisa','raghu','jodha','jodha','raghu','Cathrine', 'Alisa','Bobby','Bobby','Alisa','raghu','Cathrine'],
     'Age':[26,23,23,23,23,24,26,24,22,26,23,24], 
     'Score':[85,31,55,55,31,77,85,63,42,85,31,np.nan]}

df = pd.DataFrame(d,columns=['Name','Age','Score'])

df

df.drop([1,2])

df

Unnamed: 0,Name,Age,Score
0,Alisa,26,85.0
1,raghu,23,31.0
2,jodha,23,55.0
3,jodha,23,55.0
4,raghu,23,31.0
5,Cathrine,24,77.0
6,Alisa,26,85.0
7,Bobby,24,63.0
8,Bobby,22,42.0
9,Alisa,26,85.0
