In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found there.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

See https://github.com/KWolley/CSPB4502_UFO_PresidentialElections for the full project.
This notebook uses data sets built by teammates who cleaned the UFO-sightings data, merged with a US elections data set: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/VOQCHQ

In [None]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
import datetime

In [None]:
# Load the merged UFO-sightings / election CSV. Later cells read its "year", "totalvotes",
# "state_x", "date" and "shape" columns. low_memory=False is passed to read_csv so the
# file is not parsed in low-memory chunks.
ufo_data = pd.read_csv('../input/ufo-and-elections/merged_ufo_elect_data.csv', low_memory=False)

In [None]:
# Keep only the rows that carry election data: a year, a vote total and a state.
# dropna(subset=...) applies the same "all three present" test as a row-by-row
# pd.notna check, but in a single vectorized pass and without boxing every row
# into an object Series the way iterrows does.
filt_objs = ufo_data.dropna(subset=["year", "totalvotes", "state_x"])

In [None]:
# Working frame of election rows: only records that have year, totalvotes and state_x.
df = pd.DataFrame(filt_objs)

In [None]:
# Distinct states and years present in the election rows.
# Reading the columns directly replaces two full iterrows passes over df.
# DataFrame.get falls back to an empty tuple when the column is absent (e.g. df was
# built from zero rows), so both results are then simply empty sets.
states = set(df.get("state_x", ()))

years = set(df.get("year", ()))

In [None]:
# Total votes per year, counting each state once per year.
#
# For a given year the first row (in frame order) of each state that has a non-null
# totalvotes contributes int(totalvotes). The previous implementation re-scanned the
# whole frame with iterrows once per year; drop_duplicates + groupby computes the same
# totals in a single pass.
years = sorted(years)

# reindex guarantees the three columns exist even if df was built from zero rows.
votes = df.reindex(columns=["year", "state_x", "totalvotes"]).dropna(subset=["totalvotes"])
votes = votes.assign(
    year=votes["year"].astype("int64"),              # years are matched on their integer value
    totalvotes=votes["totalvotes"].astype("int64"),  # same truncation as int(totalvotes)
)

# keep="first" (the default) mirrors "first occurrence of the state wins".
first_per_state = votes.drop_duplicates(subset=["year", "state_x"])
votes_by_year = first_per_state.groupby("year")["totalvotes"].sum()

# One entry per element of `years`, in the same order; 0 when nothing matched.
year_sums = [int(votes_by_year.get(int(year), 0)) for year in years]

In [None]:
year_counts = {}

# A row counts as a sighting when it has a date. A boolean mask selects those rows in one
# vectorized step (no per-row iterrows loop) and keeps the original column dtypes.
sightings = ufo_data[ufo_data["date"].notna()]

# Independent copy so later cells can modify `accounts` without touching ufo_data.
accounts = sightings.copy()

In [None]:
# Number of sightings per year, as a plain {year: count} dict.
#
# The dict is rebuilt from scratch on every run, so re-executing this cell can no longer
# double the counts (the old loop kept incrementing a dict created in another cell).
# value_counts drops rows whose year is NaN; such rows cannot be placed on a year axis
# and previously produced one unsortable NaN key each.
year_counts = accounts["year"].value_counts().to_dict()

In [None]:
# (year, count) pairs in ascending year order; `tups` is reused by later cells.
tups = sorted(year_counts.items()) 
# print(tups)
# Split the pairs into parallel sequences: x = years, y = sighting counts.
x, y = zip(*tups)
# figsize is set through rcParams before the first pyplot call creates the figure.
plt.rcParams["figure.figsize"] = (18,7)
# NOTE(review): the label says "last election year" but the values come from the "year"
# column -- confirm that column holds the most recent election year.
plt.xlabel("frequency of sightings by last election year", fontsize=20)
# The two colours alternate from bar to bar.
plt.bar(x, y, color=["green", "blue"], alpha=.8, width=1)

In [None]:
# data from https://en.wikipedia.org/wiki/Voter_turnout_in_the_United_States_presidential_elections#:~:text=Note%3A%20The%20Bipartisan%20Policy%20Center,62.3%25%3B%20and%202012%2057.5%25.
# Hand-entered (election year, turnout) pairs for the presidential elections 1944-2016.
# presumably the second number is the total number of ballots cast -- verify against the source table.
turnout = [ (1944,  48026000), (1948,  48834000), (1952,61552000),
           (1956,62027000), (1960, 68836000), (1964,70098000), (1968, 73027000),
           (1972, 77625000), (1976, 81603000), (1980, 86497000),(1984, 92655000),
           (1988, 91587000), (1992,104600000), (1996,96390000), (2000, 105594000),
           (2004, 122349000), (2008, 131407000), (2012, 129235000), (2016, 138847000)
          ]

# Parallel tuples: year_turn[i] is the election year whose turnout is num_turn[i].
year_turn, num_turn = zip(*turnout)

In [None]:
# Bar chart of turnout per election year; figsize is set before the figure is created.
plt.rcParams["figure.figsize"] = (14,8)
plt.xlabel("election turnout by year", fontsize=20)
# Bar colours cycle red / white / blue; the red edge keeps the white bars visible.
plt.bar(year_turn, num_turn, color=["red", "white", "blue"], width=3, edgecolor="red", alpha=.7)

In [None]:
# Sighting counts for only those years that appear in the turnout table,
# in ascending year order (tups is already sorted by year).
visits = [count for year, count in tups if year in year_turn]

In [None]:
# One row per election year that has sightings: [sighting count, year, turnout].
#
# Rows are matched by year. Pairing the i-th sighting count with the i-th turnout entry
# by position shifts every row as soon as one turnout year has no sightings (for example
# when the sightings data starts after 1944), which silently corrupts the correlation.
sightings_by_year = dict(tups)
elect = [
    [sightings_by_year[election_year], election_year, ballots]
    for election_year, ballots in zip(year_turn, num_turn)
    if election_year in sightings_by_year
]

elect_df = pd.DataFrame(elect, columns=["visits", "year", "turnout_total"])

In [None]:
# Display the combined sightings / turnout table.
elect_df

In [None]:
# Pairwise correlation matrix of elect_df's columns (visits, year, turnout_total).
# Note that "year" is itself one of the correlated columns.
corelation = elect_df.corr()
print(corelation)

In [None]:
# Shape labels that say nothing about what was actually seen.
junk_shapes = {"other", "nan", "unknown", "changing"}

# Tally sightings per reported shape, skipping junk labels and non-string values
# (missing shapes arrive as float NaN). Iterating the column directly avoids building
# a Series for every row as iterrows does; dict insertion order (first appearance)
# is preserved, so the bar order is unchanged.
shape_counts = {}
for shape in accounts["shape"]:
    if isinstance(shape, str) and shape not in junk_shapes:
        shape_counts[shape] = shape_counts.get(shape, 0) + 1

# Keys are guaranteed to be str by the filter above, so no extra conversion is needed.
shape_x = list(shape_counts.keys())
shape_y = list(shape_counts.values())
plt.rcParams["figure.figsize"] = (22,6)
plt.xticks(rotation=90, fontsize=32)
plt.ylabel("sightings", fontsize=28)
plt.bar(shape_x, shape_y, alpha=.7)
# Saved under its own name: the grouped-shape chart further down writes
# "sightings_shape_hist.png", which would otherwise overwrite this figure.
plt.savefig("sightings_shape_raw_hist.png")

### Time to aggregate like shapes

In [None]:
# Raw shape labels that are folded into one coarse group each.
circular = ["circle", "sphere", "egg", "oval", "round", "disk"]
triangular_pointed = ["triangle", "delta", "diamond", "pyramid", "chevron"]
light_based = ["light", "fireball", "flash", "flare"]

def shape_map(shape): # grouping like shapes
    """Return the coarse group name for a raw shape label.

    Labels listed in `circular`, `triangular_pointed` or `light_based` map to
    "circular", "triangular/pointed" or "light based" respectively; any other
    value is returned unchanged.
    """
    shape_groups = (
        (circular, "circular"),
        (triangular_pointed, "triangular/pointed"),
        (light_based, "light based"),
    )
    for members, group_name in shape_groups:
        if shape in members:
            return group_name
    return shape
    

# Shape labels that say nothing about what was actually seen.
junk_shapes = {"other", "nan", "unknown", "changing"}

# shape_year[year][group] = number of sightings of that shape group in that year.
#
# The year is converted to int once and that same key is used for both lookup and
# storage. Testing membership with the raw value while storing under int(value) would
# reset a year's tallies whenever the two do not compare equal.
shape_year = {}
for year, shape in zip(accounts["year"], accounts["shape"]):
    # Skip junk labels and non-string values (missing shapes arrive as float NaN).
    if not isinstance(shape, str) or shape in junk_shapes:
        continue
    group = shape_map(shape)
    per_year = shape_year.setdefault(int(year), {})
    per_year[group] = per_year.get(group, 0) + 1


# Shape labels that say nothing about what was actually seen.
junk_shapes = {"other", "nan", "unknown", "changing"}

# Tally sightings per shape *group* (see shape_map), skipping junk labels and
# non-string values. Iterating the column directly avoids the per-row Series that
# iterrows builds, and shape_map is called once per row instead of up to three times.
shape_counts_grouped = {}
for shape in accounts["shape"]:
    if isinstance(shape, str) and shape not in junk_shapes:
        group = shape_map(shape)
        shape_counts_grouped[group] = shape_counts_grouped.get(group, 0) + 1

# Keys are already str (shape_map returns either a group name or the original string).
shape_x = list(shape_counts_grouped.keys())
shape_y = list(shape_counts_grouped.values())
plt.rcParams["figure.figsize"] = (22,6)
plt.xticks(rotation=90, fontsize=32)
plt.ylabel("sightings", fontsize=28)
plt.bar(shape_x, shape_y, alpha=.7)
plt.savefig("sightings_shape_hist.png")

In [None]:
import collections

# Copy the per-year shape tallies into an ordered mapping, ascending by year,
# so the resulting frame's columns run chronologically.
ordered_by_year = collections.OrderedDict()
for year_key in sorted(shape_year):
    ordered_by_year[year_key] = shape_year[year_key]

# Columns are years, rows are shape groups; a shape absent from a year shows up as NaN.
df_shape_year = pd.DataFrame(ordered_by_year)
df_shape_year.head()

In [None]:
# cleaning: a shape group that never occurs in a given year is missing from that year's
# dict and therefore NaN in the frame; treat it as zero sightings.
df_shape_year = df_shape_year.fillna(0)

In [None]:

import seaborn as sns
# Heatmap of raw sighting counts per shape group (rows) and year (columns),
# restricted to the columns from label 1952 onward.
plt.rcParams["figure.figsize"] = (18,12)
sns.set(font_scale=1.8)
# robust=True derives the colour limits from robust quantiles rather than the extremes.
ax = sns.heatmap(df_shape_year.loc[:,1952:],cmap='viridis', robust=True)
# Saved under its own name: the normalized heatmap further down writes
# "shape_year_heatmap.jpg", which would otherwise overwrite this figure.
plt.savefig("shape_year_counts_heatmap.jpg")

In [None]:
## dividing each shape counts per year by that year's overall sightings
# The denominator is year_counts[year], i.e. every sighting of that year, including
# those whose shape was filtered out, so a column need not sum to 1.
# A missing year raises KeyError, exactly as a direct lookup would.
year_totals = pd.Series(
    [year_counts[year] for year in df_shape_year.columns],
    index=df_shape_year.columns,
)
# One aligned, vectorized division replaces a .loc write per cell inside nested loops
# (which also tried to store floats into whatever dtype the column happened to have).
df_shape_year = df_shape_year.div(year_totals, axis="columns")

In [None]:
# Heatmap of df_shape_year for the columns from label 1960 onward.
plt.rcParams["figure.figsize"] = (18,12)
sns.set(font_scale=1.8)
# vmax=.4 caps the colour scale at 0.4; larger values all get the top colour.
ax = sns.heatmap(df_shape_year.loc[:,1960:],cmap='viridis', vmax=.4)
plt.savefig("shape_year_heatmap.jpg")


Plotting the heatmap as each shape's fraction of all sightings in a given year seems to show that 
circular stops being the dominant shape in the mid/late 90s, while light-based becomes more popular.

- Are there other pattern shifts around that time?

Let's try classifying the comments attribute, which describes UFO sightings, to see if there is a difference between pre- and post-mid-90s comments.


In [None]:
import numpy as np
import random
import pandas as pd
import math
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
import datetime
import nltk
import re
import string
from nltk import WordNetLemmatizer
from nltk.stem import PorterStemmer 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from wordcloud import WordCloud

# Load the data set used for the comment classification below.
# NOTE: this rebinds `df`, which an earlier cell used for the filtered election rows;
# from here on `df` is the contents of samsclean.csv.
df = pd.read_csv('../input/ufo-and-elections/samsclean.csv')

In [None]:
# Peek at the first rows of the frame loaded from samsclean.csv.
df.head()

In [None]:
def year_pre_post(x):
    """Bucket a year into the two classes used by the classifiers.

    Returns "pre 96" when x < 1996, otherwise "post 96" (so 1996 itself, and any
    value for which the comparison is false, lands in "post 96").
    """
    return "pre 96" if x < 1996 else "post 96"

In [None]:
# Replace the numeric last_election_year with its "pre 96" / "post 96" class label.
# The column is overwritten in place, so the conversion is skipped when it already
# holds only labels: re-running this cell would otherwise compare strings with 1996
# inside year_pre_post and raise a TypeError.
_era_labels = ["pre 96", "post 96"]
if not df["last_election_year"].isin(_era_labels).all():
    df["last_election_year"] = df["last_election_year"].apply(year_pre_post)

In [None]:
#cleaning and removing stop words

    
# Download NLTK's stop-word corpus and load the English list used by text_preproc.
# (A bare df.head() used to sit here; as a non-final expression in the cell it
# displayed nothing, so it was removed.)
nltk.download('stopwords')
stop_words = stopwords.words("english")

def text_preproc(x):
    """Clean one comment for vectorizing.

    Non-string input (for example a missing value) yields "". For strings, characters
    that cannot be encoded as ASCII are dropped, the text is split on single spaces,
    and every token found in `stop_words` or equal to "pd" is removed before the
    remaining tokens are re-joined with single spaces.
    """
    if type(x) != str:
        return ""

    ascii_only = x.encode('ascii', 'ignore').decode()
    kept = []
    for word in ascii_only.split(' '):
        # presumably "pd" is a recurring artifact in the comments data -- TODO confirm
        if word in stop_words or word == "pd":
            continue
        kept.append(word)
    return ' '.join(kept)

# Clean every comment once. text_preproc is idempotent (a second pass finds nothing
# left to strip or remove), so the duplicated apply only doubled the running time.
df['comments'] = df['comments'].apply(text_preproc)

df.head()

Let's make a training and test set, and vectorize the comments.

In [None]:
# TF-IDF features over the cleaned comments, with sklearn's built-in English stop list.
vectorizer = TfidfVectorizer(stop_words='english') # suspiciously accurate
#vectorizer = CountVectorizer(stop_words='english')
# NOTE(review): this fit sees every row of df, including rows that are later held out
# for testing. Features used for evaluation should come from a vectorizer fitted on
# training rows only, otherwise test-set vocabulary/IDF statistics leak into the model.
X = vectorizer.fit_transform(df["comments"])

In [None]:


# Random 80/20 split. random_state pins the shuffle so the split, and therefore every
# metric reported below, is the same on each Restart & Run All.
training_set = df.sample(frac=0.8, random_state=42)

# Everything not sampled into the training set becomes the test set.
test_set = df.drop(training_set.index)


from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, average_precision_score, f1_score, classification_report

# Fit the vectorizer on the training comments only, then reuse that vocabulary and
# those IDF weights for the test comments. Transforming with a vectorizer that was
# fitted on the full frame lets test-set statistics leak into the features and
# inflates the scores.
training_X = vectorizer.fit_transform(training_set["comments"])
test_vect = vectorizer.transform(test_set["comments"]) # vectorizing train and test comments




Let's try naive Bayes

In [None]:
# Train multinomial naive Bayes on the training vectors, then label the held-out comments.
clf_year = MultinomialNB()
clf_year.fit(training_X, training_set["last_election_year"])
predicted_year = clf_year.predict(test_vect)

print(predicted_year)
# Number of test comments the model assigned to the "pre 96" class.
pre_predictions = int((predicted_year == "pre 96").sum())
print(pre_predictions)


# Per-class precision / recall / F1 against the true labels.
print(classification_report(test_set["last_election_year"], predicted_year))

NOT GOOD! Due to a class imbalance (we have many more post 96 comments than pre 96), our model learned to just always predict that a comment is post 96. Let's try some other classifiers.

In [None]:
# Same experiment with a linear support-vector classifier (LinearSVC).
clf_year_svm = LinearSVC()
clf_year_svm.fit(training_X, training_set["last_election_year"])
predicted_year_svm = clf_year_svm.predict(test_vect)

print(predicted_year_svm)
# Number of test comments the SVM assigned to the "pre 96" class.
pre_predictions_svm = int((predicted_year_svm == "pre 96").sum())
print(pre_predictions_svm)


# Per-class precision / recall / F1 against the true labels.
print(classification_report(test_set["last_election_year"], predicted_year_svm))

A little better, but not very good pre 96 recall

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient-boosted trees: 200 estimators, each up to depth 8, on the same features.
gb_clf = GradientBoostingClassifier(n_estimators=200, max_depth=8)
gb_clf.fit(training_X, training_set["last_election_year"])

predicted_year_gb = gb_clf.predict(test_vect)

# Per-class precision / recall / F1 against the true labels.
print(classification_report(test_set["last_election_year"], predicted_year_gb))

Not so hot. But the F-scores for SGD and SVC are both above 0.5, and the wordclouds seem to indicate a difference between pre and post 95 comments.

Wordclouds:

In [None]:
# Split the comments by era label and build one text blob per era.
is_pre = df["last_election_year"] == "pre 96"
num_pre = int(is_pre.sum())
num_post = int((~is_pre).sum())

# Join with a space between comments. Plain concatenation glued the last word of one
# comment to the first word of the next, creating bogus tokens and distorting the word
# frequencies; str.join also avoids the quadratic cost of repeated string +=.
pre_95_comments = " ".join(df.loc[is_pre, "comments"])
post_95_comments = " ".join(df.loc[~is_pre, "comments"])

# Word cloud limited to the 20 most prominent words of the "pre 96" comments.
wordcloud_pre = WordCloud(max_words=20).generate(pre_95_comments)
print("pre", num_pre)
print("post", num_post)

plt.imshow(wordcloud_pre)

Above is a pre 95 comments wordcloud
Below is post 95

In [None]:
# Word cloud limited to 20 words, built from the text accumulated for rows not labelled "pre 96".
wordcloud_post = WordCloud(max_words=20).generate(post_95_comments) 
plt.imshow(wordcloud_post)

"Light", "Sky" and "Object" become much more popular post 1995 than beforehand. Thats kind of interesting because those seem to be among the most vague descriptors for ufo sightings. For more conclusions, and analyses see the rest of our project at: https://github.com/KWolley/CSPB4502_UFO_PresidentialElections 