In [20]:
# Import statements 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
from textblob import TextBlob
import string
import datetime

In [9]:
# Load the pre-cleaned feature tables for the train / dev / test splits.
# Paths are relative to the notebook's working directory.
# NOTE(review): the recorded run of this cell emitted a (truncated) warning and
# the count columns later show up with dtype `object`; this may point to
# mixed-type columns in the CSV -- consider passing an explicit `dtype=`.
train_feats = pd.read_csv('train-dev-test/train_cleaned_features.csv', delimiter=',', encoding="utf-8")
dev_feats = pd.read_csv('train-dev-test/dev_cleaned_features.csv', delimiter=',', encoding="utf-8")
test_feats = pd.read_csv('train-dev-test/test_cleaned_features.csv', delimiter=',', encoding="utf-8")

# Strip stray whitespace from the column names of every split (not only the
# training one), so the same column labels can be used on all three frames.
train_feats.columns = train_feats.columns.str.strip()
dev_feats.columns = dev_feats.columns.str.strip()
test_feats.columns = test_feats.columns.str.strip()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [10]:
# Prune the training frame: rows without a time value, then the hashtags column.

# Report the row count before and after removing rows whose `time` is NA.
print("Old length: ", len(train_feats))
train_feats = train_feats.dropna(subset=['time'])
print("New length: ", len(train_feats))

# Hashtags are left out for now (too difficult for M3 to deal with).
train_feats = train_feats.drop(columns=['hashtags'])

Old length:  3515562
New length:  3515538


In [12]:
# Preview the first five rows of the training frame
# (use `train_feats.columns` instead to list only the column names).
train_feats.head()

Unnamed: 0,text,place,user_location,followers_count,retweet_count,favorite_count,weekday,month,day,year,...,hashtag_present,covid,vaccine,profanity_present,emoji_present,url_present,question_exclamation_present,sentiment,hour,minute
0,Pretty much any corona virus germ that wants t...,-1,26640,478,0,10,4,6,19.0,2020.0,...,0,1,0,0,0,1,0,2,19,52
1,RT @RealSaavedra: Good.\n\nIt came from China.,-1,33359,394,102,0,4,6,19.0,2020.0,...,0,0,0,0,0,0,0,2,19,52
2,RT @nicolebyer: Everyone in this looks sick,-1,41140,695,48499,0,4,6,19.0,2020.0,...,0,0,0,0,0,0,0,0,19,52
3,RT @jabinbotsford: Close up of President @real...,-1,-1,19,30492,0,4,6,19.0,2020.0,...,0,1,0,0,0,0,0,1,19,52
4,RT @Biancaixvi: Corona day 3: it just feels li...,-1,15816,392,85302,0,4,6,19.0,2020.0,...,0,1,0,0,0,0,0,1,19,52


In [13]:
# Show the dtype of every column currently in the training frame.
train_feats.dtypes

text                             object
place                              int8
user_location                     int32
followers_count                  object
retweet_count                    object
favorite_count                   object
weekday                            int8
month                              int8
day                             float64
year                            float64
num_present                       int64
trump_present                     int64
hashtag_present                   int64
covid                             int64
vaccine                           int64
profanity_present                 int64
emoji_present                     int64
url_present                       int64
question_exclamation_present      int64
sentiment                          int8
hour                              int64
minute                            int64
dtype: object

In [11]:
# Feature manipulation to make the training frame ready for the model.
# NOTE: this cell reads `time` and then drops it, so it is not idempotent --
# re-running it requires re-running the load and drop cells first.

# Columns replaced by their integer category codes (`sentiment` is the label).
# NOTE(review): the codes are derived from the training frame alone; confirm
# how dev/test are meant to be encoded consistently.
categorical_columns = ["sentiment", "user_location", "place", "weekday", "month"]
for column in categorical_columns:
    train_feats[column] = train_feats[column].astype('category').cat.codes

# Parse `time` ('HH:MM:SS') in a single vectorized call rather than a per-row
# Python `strptime`; the frame holds millions of rows.
parsed_time = pd.to_datetime(train_feats['time'].astype(str), format='%H:%M:%S')

# Add hour and minute columns. The explicit int64 cast keeps a stable integer
# dtype and raises if a missing time value slipped through.
train_feats['hour'] = parsed_time.dt.hour.astype('int64')
train_feats['minute'] = parsed_time.dt.minute.astype('int64')

# The raw time column is no longer needed once hour/minute exist.
train_feats = train_feats.drop(['time'], axis=1)

# Change all the bool-like indicator columns to plain integers.
flag_columns = [
    "num_present",
    "trump_present",
    "hashtag_present",
    "covid",
    "vaccine",
    "profanity_present",
    "emoji_present",
    "url_present",
    "question_exclamation_present",
]
for column in flag_columns:
    train_feats[column] = train_feats[column].astype(int)

In [14]:
# Remove the raw `text` column (JUST FOR A TEST).
train_feats = train_feats.drop(columns=['text'])

In [25]:
# Text features are not implemented yet; this cell only tries TextBlob out
# on the text of the first row of the test split.

first_text = test_feats.head(1)['text'][0]
print(first_text)
blob = TextBlob(first_text)

# Part-of-speech tags, printed as a list of (token, tag) pairs
print(blob.tags)

# Noun phrases found in the text
print(blob.noun_phrases)

# One polarity value per sentence
for sentence in blob.sentences:
    sentence_sentiment = sentence.sentiment
    print(sentence_sentiment.polarity)

@Jeff_Reinebold No corona out there, that’s for sure. Enjoy
[('@', 'NNS'), ('Jeff_Reinebold', 'NNP'), ('No', 'NNP'), ('corona', 'NN'), ('out', 'IN'), ('there', 'RB'), ('that', 'IN'), ('’', 'NNP'), ('s', 'NN'), ('for', 'IN'), ('sure', 'JJ'), ('Enjoy', 'NN')]
['jeff_reinebold', '’ s', 'enjoy']
0.5
0.4


In [15]:
# Split the training frame into a feature matrix and a label vector.

# The sentiment column is the target; take it out of the feature frame.
labels = train_feats.loc[:, 'sentiment']
train_feats = train_feats.drop(columns=['sentiment'])

# NumPy arrays for scikit-learn: X holds the features, y the labels.
X, y = train_feats.to_numpy(), labels.to_numpy()

In [19]:
# Multinomial logistic regression using the lbfgs solver
model = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
)

# Fit on the training arrays; as the last expression of the cell, the call's
# return value is what the notebook displays.
model.fit(X, y)

LogisticRegression(multi_class='multinomial')

In [None]:
# Predict on the same matrix the model was fitted on, so the number printed
# below is training accuracy rather than a held-out estimate.
y_preds = model.predict(X)
# Print score -- accuracy_score's documented argument order is (y_true, y_pred)
print(accuracy_score(y, y_preds))