In [None]:
import pandas as pd
import numpy as np

import re
import nltk

import plotly.graph_objects as go
import plotly_express as px
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode

import matplotlib.pyplot as plt
from wordcloud import WordCloud

from PIL import Image
import requests
from io import BytesIO

# Set up plotly's notebook integration once, before any figure is displayed.
# NOTE(review): presumably needed so later fig.show() calls render inline/offline — confirm for the plotly version in use.
init_notebook_mode()

![](https://i.imgur.com/edaRFkX.png)

# The Data

The dataset is provided by [Neelima Jauhari](https://www.kaggle.com/nilimajauhari). 

It contains the Lunar Anomalies observed between the years 1540 and 1967. 

In [None]:
# Read the lunar-anomaly records from the Kaggle input directory and preview the first rows.
DATA_PATH = '../input/500-years-of-mysterious-lunar-anomalies/Lunar Anomalies.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Report how many rows (observations) the dataset holds.
observationCount = df.shape[0]
print(f'There are {observationCount} number of observations in the dataset')

# Exploration through Time

In [None]:
# Running total of observations: count rows per year, then accumulate over time.
yearlyObservation = (
    df.groupby('Year')
      .size()
      .cumsum()
      .rename('Total Observations')
      .reset_index()
)

fig = px.line(
    yearlyObservation,
    x='Year',
    y='Total Observations',
    title='Anamolies Observed over the Year',
    template='plotly_dark',
)
fig.show()

In [None]:
# Describe the first row of the dataset in a sentence.
# NOTE(review): row 0 is the earliest observation only if the CSV is in chronological order — confirm.
firstObservation = df.iloc[0]
print(
    f"The First Lunar Anamoly in our dataset was observed {firstObservation['Day']} day of {firstObservation['Month']} "
    f"in the year {firstObservation['Year']} by {firstObservation['Credit']}. "
    f"The Description of this anamoly is - {firstObservation['Description']} "
    f"and the location is {firstObservation['Location']}"
)

In [None]:
# Describe the last row of the dataset in a sentence.
# NOTE(review): the last row is the latest observation only if the CSV is in chronological order — confirm.
latestObservation = df.iloc[-1]
print(
    f"The Latest Lunar Anamoly in our dataset was observed {latestObservation['Day']} day of {latestObservation['Month']} "
    f"in the year {latestObservation['Year']} by {latestObservation['Credit']}. "
    f"The Description of this anamoly is - {latestObservation['Description']} "
    f"and the location is {latestObservation['Location']}"
)

In [None]:
# Count observations per month and draw them as horizontal bars in calendar order.
calendarMonths = ['Jan','Feb','Mar','Apr','May','Jun',
                  'Jul','Aug','Sep','Oct','Nov','Dec']

monthlyObservation = df['Month'].value_counts().reset_index()
monthlyObservation.columns = ['Month', 'Number of Observation']

fig = px.bar(
    monthlyObservation,
    x='Number of Observation',
    y='Month',
    orientation='h',
    text='Number of Observation',
    template='plotly_white',
    title='Number of Observations Each Month',
    category_orders={'Month': calendarMonths},
)
fig.show()

In [None]:
# Count observations per day of the month; the frequency-ordered rows are then reversed
# (least frequent first) before plotting.
dayCounts = df['Day'].value_counts().reset_index()
dailyObservation = dayCounts.iloc[::-1]
dailyObservation.columns = ['Day', 'Number of Observation']

fig = px.bar(
    dailyObservation,
    x='Day',
    y='Number of Observation',
    orientation='v',
    text='Number of Observation',
    template='ggplot2',
    title='Number of Observations Each Day',
)
fig.show()

We see that October has the highest number of observations, and we also observe a large number of anomalies on the 19th day of the month. 

Given below is a table containing the anomalies observed on the 19th of October.

In [None]:
# Select every observation dated 19 October and render it as a blue-shaded table.
isNineteenth = df['Day'].eq(19)
isOctober = df['Month'].eq('Oct')
oct19 = df[isNineteenth & isOctober]
oct19.style.background_gradient(cmap='Blues')

We can see some problems in the credit section of the dataset. I will work on them in later versions of the notebook. Stay tuned!

# Exploration Across Location

In [None]:
def fixLocation(loc):
    """Collapse a free-text location label onto a canonical name where one applies.

    Parameters
    ----------
    loc : str
        Raw location text. A missing value is recognised only as the exact
        string 'nan'.

    Returns
    -------
    str
        'Dark Side' when the text equals 'dark side' ignoring case,
        'Unknown' when the text is exactly 'nan', one of 'Aristarchus',
        'Gassendi', 'Plato' or 'Alphonsus' when that name occurs anywhere in
        the text (case-insensitive, checked in that order), otherwise the
        input unchanged.
    """
    lowered = loc.lower()

    # Exact-match rules. Neither literal contains any keyword below, so
    # handling them first cannot change which rule wins.
    if lowered == 'dark side':
        return 'Dark Side'
    if loc == 'nan':
        return 'Unknown'

    # Substring rules in priority order: the first keyword found decides.
    keywordToName = (
        ('aristarchus', 'Aristarchus'),
        ('gassendi', 'Gassendi'),
        ('plato', 'Plato'),
        ('alphonsus', 'Alphonsus'),
    )
    for keyword, canonicalName in keywordToName:
        if keyword in lowered:
            return canonicalName

    return loc
    
# Normalise every location label; each value is stringified first, so a missing (NaN)
# location reaches fixLocation as the text 'nan'.
df['Location'] = df['Location'].map(lambda rawLocation: fixLocation(str(rawLocation)))

In [None]:
# Rank locations by number of observations and chart the twenty most frequent.
locCounts = df['Location'].value_counts().reset_index()
locCounts.columns = ['Location', 'Observations']

topLocations = locCounts.head(20)
fig = px.bar(
    topLocations,
    x='Observations',
    y='Location',
    orientation='h',
    text='Observations',
    color='Observations',
    color_continuous_scale='Blugrn',
    title='<b>Top 20 Locations With the Most Observed Anamolies</b>',
)
fig.show()

We see a large number of Lunar Anomalies in or around Aristarchus. 

But wait a minute! Wasn't Aristarchus a Greek astronomer? Let's see what Wikipedia has to say about this. 

<div class="alert alert-success" role="alert">
    Aristarchus, named after the Greek astronomer Aristarchus of Samos, is a prominent lunar impact crater that lies in the northwest part of the Moon's near side. It is considered the brightest of the large formations on the lunar surface, with an albedo nearly double that of most lunar features. The feature is bright enough to be visible to the naked eye, and displays unusually bright features when viewed through a large telescope. It is also readily identified when most of the lunar surface is illuminated by earthshine. The crater is deeper than the Grand Canyon.
    
    
The crater is located at the southeastern edge of the Aristarchus plateau, an elevated area that contains a number of volcanic features, such as sinuous rilles. This area is also noted for the large number of reported transient lunar phenomena, as well as recent emissions of radon gas as measured by the Lunar Prospector spacecraft. 
</div>

# I saw it first

In [None]:
# Rank credited names by number of observations and chart the top twenty.
# These two Credit values are left out of the ranking.
# NOTE(review): presumably they are mis-parsed fragments rather than observer names — confirm.
excludedCredits = ['E', 'base inner W wall']
creditDf = df[~df['Credit'].isin(excludedCredits)]

credits = creditDf['Credit'].value_counts().reset_index()
credits.columns = ['Credited to', 'Number of Observations']

topCredits = credits.head(20)
fig = px.bar(
    topCredits,
    x='Number of Observations',
    y='Credited to',
    orientation='h',
    text='Number of Observations',
    color='Number of Observations',
    color_continuous_scale='Peach',
    title='Top 20 Names with Most Credits',
)
fig.show()

While I could not find any Wikipedia reference to Bartlett, here's something about the others.

<div class="alert alert-success" role="alert">
Johann Hieronymus Schröter (30 August 1745, Erfurt – 29 August 1816, Lilienthal) was a German astronomer. 
The lunar crater Schröter and the Martian crater Schroeter are named after him, as is Vallis Schröteri (Schröter's Valley) on the Moon. 
</div>

<div class="alert alert-success" role="alert">
Sir Patrick Alfred Caldwell-Moore CBE HonFRS FRAS (4 March 1923 – 9 December 2012) was an English amateur astronomer who attained prominence in that field as a writer, researcher, radio commentator and television presenter.

Moore was president of the British Astronomical Association; co-founder and president of the Society for Popular Astronomy; author of over seventy books on astronomy; and presenter of the world's longest-running television series with the same original presenter, BBC's The Sky at Night (from 1957). He became known as a specialist in Moon observation and for creating the Caldwell catalogue. Idiosyncrasies such as his rapid diction and monocle made him a popular and instantly recognisable figure on British television. 
</div>

<div class="alert alert-success" role="alert">
Baron Franz von Paula (Franciscus de Paula) Gruithuisen (March 19, 1774 – June 21, 1852) was a Bavarian physician and astronomer. He taught medical students before becoming a professor of astronomy at the University of Munich in 1826. 
    
Like others before and since his time, Gruithuisen believed that the Earth's moon was habitable. He made multiple observations of the lunar surface that supported his beliefs, including his announcement of the discovery of a city in the rough terrain to the north of Schröter crater he named the Wallwerk. This region contains a series of somewhat linear ridges that have a fishbone-like pattern, and, with the small refracting telescope he was using, could be perceived as resembling buildings complete with streets. He published his observations in 1824, but they were greeted with much skepticism by other astronomers of the time. His claims were readily refuted using more powerful instruments.

He is also noted for the discovery of bright caps on the cusps of the crescent Venus, and for being the first to suggest that craters on the Moon were caused by meteorite impacts. He proposed that jungles on Venus grew more rapidly than in Brazil due to the proximity of the planet to the Sun, and that as a consequence the inhabitants celebrated fire festivals— the cause of the bright caps on Venus. 
    
</div>

Interestingly, all 3 of them have Craters named after them.

# What did I see?

In [None]:
# Build one lower-cased text blob from all descriptions (missing ones count as empty),
# strip a fixed set of punctuation characters, and collapse whitespace runs to single spaces.
joinedDescriptions = ' '.join(df['Description'].fillna(''))
withoutPunctuation = re.sub(r'[,.!*):(<]', '', joinedDescriptions.lower())
corpus = re.sub(r'\s+', ' ', withoutPunctuation)

In [None]:
# Tokenise the corpus and drop English stop words.
# NOTE(review): both calls rely on NLTK data (stop-word list, tokenizer models) already being
# available in the environment — confirm, or download them beforehand.
stopwords = set(nltk.corpus.stopwords.words('english'))
tokens = nltk.word_tokenize(corpus)
tokensWithoutStopwords = [word for word in tokens if word not in stopwords]

In [None]:
# Count adjacent word pairs and chart the ten most frequent ones.
biGrams = nltk.bigrams(tokensWithoutStopwords)
freqBiGrams = nltk.FreqDist(biGrams)

# Turn each (word, word) tuple into a readable "word word" label mapped to its count.
topBigramCounts = {
    ' '.join(wordPair): count
    for wordPair, count in freqBiGrams.most_common(10)
}
tenFreqBiGrams = pd.DataFrame(topBigramCounts.items(), columns=['Bigram', 'Frequency'])

fig = px.bar(
    tenFreqBiGrams,
    x='Bigram',
    y='Frequency',
    color='Frequency',
    color_continuous_scale='Gray',
    template='plotly_dark',
    title='Ten Most Frequent Bigrams',
)
fig.show()

In [None]:
# Count adjacent word triples and chart the ten most frequent ones.
triGrams = nltk.trigrams(tokensWithoutStopwords)
freqtriGrams = nltk.FreqDist(triGrams)

# Turn each (word, word, word) tuple into a readable label mapped to its count.
topTrigramCounts = {
    ' '.join(wordTriple): count
    for wordTriple, count in freqtriGrams.most_common(10)
}
tenFreqtriGrams = pd.DataFrame(topTrigramCounts.items(), columns=['Trigram', 'Frequency'])

fig = px.bar(
    tenFreqtriGrams,
    x='Trigram',
    y='Frequency',
    color='Frequency',
    color_continuous_scale='Gray',
    template='plotly_dark',
    title='Ten Most Frequent Trigrams',
)
fig.show()

In [None]:
# Download the moon illustration that shapes the word cloud.
# A timeout keeps the cell from hanging on a stalled connection, and raise_for_status()
# turns an HTTP error into an explicit exception instead of an obscure image-decoding failure.
maskUrl = 'https://st.depositphotos.com/1724125/2605/v/950/depositphotos_26050193-stock-illustration-cartoon-moon.jpg'
response = requests.get(maskUrl, timeout=30)
response.raise_for_status()
img = Image.open(BytesIO(response.content))
mask = np.asarray(img)

# Render the word cloud of all descriptions inside the mask.
# NOTE(review): the wordcloud docs state width/height are ignored when a mask is given — confirm;
# they are kept here so the call is unchanged.
plt.subplots(figsize=(12,8))
wordcloud = WordCloud(background_color='Black',
                      mask = mask,
                      contour_color='White', contour_width=5, 
                      stopwords=stopwords,
                      width=1500, margin=10,
                      height=1080
                     ).generate(corpus)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

Most of the anomalies observed involve some sort of bright spot, or a spot showing a different color such as blue or violet.

There have also been quite a few anomaly descriptions that mention volcanic activity; let's take a look at them as well.

In [None]:
# Keep the rows whose description mentions a volcano (either capitalisation of the two word stems);
# missing descriptions are treated as empty text and therefore never match.
volcanoPattern = 'volcano|Volcano|volcanic|Volcanic'
mentionsVolcano = df['Description'].fillna('').str.contains(volcanoPattern)
volcanicAnamoly = df[mentionsVolcano]

In [None]:
# Summarise the volcanic observations: counts per month and a running total by year.
monthlyObservation = volcanicAnamoly['Month'].value_counts().reset_index()
monthlyObservation.columns = ['Month', 'Number of Observation']
yearlyObservation = volcanicAnamoly.groupby('Year').size().cumsum().reset_index()
yearlyObservation.columns = ['Year', 'Number of Observation']

fig = make_subplots(rows = 1, cols = 2, 
                    subplot_titles=['Volcanic Observations over the Years','Volcanic Observations Each Month'],
                   horizontal_spacing=0.1)

# Left panel: cumulative observations over time.
trace0 = go.Scatter(x = yearlyObservation['Year'], y = yearlyObservation['Number of Observation'], mode='lines',name = 'Yearly Observations')
fig.add_trace(trace0, row = 1, col = 1)

# Right panel: observations per month as horizontal bars.
trace1 = go.Bar(x = monthlyObservation['Number of Observation'], y = monthlyObservation['Month'], orientation='h', name = 'Monthly Observations')
fig.add_trace(trace1, row = 1, col = 2)

# value_counts() returns months in frequency order; pin the month axis to calendar order so this
# panel matches the overall monthly chart. The list is reversed because a categorical y-axis is
# laid out bottom-to-top, which puts Jan at the top.
calendarMonths = ['Jan','Feb','Mar','Apr','May','Jun',
                  'Jul','Aug','Sep','Oct','Nov','Dec']
fig.update_yaxes(categoryorder = 'array', categoryarray = calendarMonths[::-1], row = 1, col = 2)

fig.update_layout(showlegend = False, template = 'seaborn')
fig.show()

There have been a total of 16 volcanic anomalies from 1787 to 1898. 

Volcanic anomalies have been seen most frequently in the month of April. More importantly, most of the observations fall between March and July. 

In [None]:
# Tally the volcanic observations by observer and by location.
observers = volcanicAnamoly['Credit'].value_counts().reset_index()
observers.columns = ['Observer', 'Number of Observation']
locations = volcanicAnamoly['Location'].value_counts().reset_index()
locations.columns = ['Location', 'Number of Observation']

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=['Volcanic Anamolies Locations', 'Volcanic Anamolies Observers'],
    horizontal_spacing=0.2,
)

# Left panel: observations per location.
trace0 = go.Bar(
    x=locations['Number of Observation'],
    y=locations['Location'],
    orientation='h',
    name='Location',
)
fig.add_trace(trace0, row=1, col=1)

# Right panel: observations per credited observer.
trace1 = go.Bar(
    x=observers['Number of Observation'],
    y=observers['Observer'],
    orientation='h',
    name='Credit',
)
fig.add_trace(trace1, row=1, col=2)

fig.update_layout(showlegend=False, template='seaborn')
fig.show()

The most common location for observing volcanic anomalies is Schröter's Valley. Let's get some more information about it from Wikipedia. 

<div class="alert alert-success" role="alert">
Schroter's Valley, frequently known by the Latinized name Vallis Schröteri, is a sinuous valley or rille on the surface of the near side of the Moon. It is located on a rise of continental ground, sometimes called the Aristarchus plateau, that is surrounded by the Oceanus Procellarum to the south and west and the Mare Imbrium to the northwest. At the southern edge of this rise are the craters Aristarchus and Herodotus. 
    
The origins of this rille are believed to be volcanic. The interior floor has been resurfaced and is very level. However, there is a slender rille located on the floor, which can be photographed from Earth with a good telescope and good seeing.    
</div>

W.H. Pickering ranks 6th in the overall number of anomalies observed. He has 8 anomalies to his credit, of which 7 are volcanic in nature. 

The only other observation credited to him is:

In [None]:
# Show the W.H. Pickering observations that are NOT among the volcanic ones.
# Taking the first Pickering row only gives the right answer if that row happens to be the
# non-volcanic one; excluding the rows present in volcanicAnamoly (a row subset of df, so it
# shares df's index labels) selects it explicitly.
pickeringObservations = df[df['Credit'] == 'W.H. Pickering']
pickeringObservations[~pickeringObservations.index.isin(volcanicAnamoly.index)]

# The Dark Side 

The phrase "dark side of the Moon" does not refer to "dark" as in the absence of light, but rather "dark" as in unknown: until humans were able to send spacecraft around the Moon, this area had never been seen. While many misconstrue this to think that the "dark side" receives little to no sunlight, in reality, both the near and far sides receive (on average) almost equal amounts of light directly from the Sun. 

Source : Wikipedia

In [None]:
# Isolate the observations whose normalised location is the dark side and report how many there are.
onDarkSide = df['Location'].eq('Dark Side')
darkSide = df[onDarkSide]
print(f'There are {len(darkSide)} anamoly observations from the dark side of the moon')

In [None]:
# Summarise the dark-side observations: counts per month and a running total by year.
monthlyObservation = darkSide['Month'].value_counts().reset_index()
monthlyObservation.columns = ['Month', 'Number of Observation']
yearlyObservation = darkSide.groupby('Year').size().cumsum().reset_index()
yearlyObservation.columns = ['Year', 'Number of Observation']

fig = make_subplots(rows = 1, cols = 2, 
                    subplot_titles=['Dark Side Observations over the Years','Dark Side Observations Each Month'],
                   horizontal_spacing=0.1)

# Left panel: cumulative observations over time.
trace0 = go.Scatter(x = yearlyObservation['Year'], y = yearlyObservation['Number of Observation'], mode='lines',name = 'Yearly Observations')
fig.add_trace(trace0, row = 1, col = 1)

# Right panel: observations per month as horizontal bars.
trace1 = go.Bar(x = monthlyObservation['Number of Observation'], y = monthlyObservation['Month'], orientation='h', name = 'Monthly Observations')
fig.add_trace(trace1, row = 1, col = 2)

# value_counts() returns months in frequency order; pin the month axis to calendar order so this
# panel matches the overall monthly chart. The list is reversed because a categorical y-axis is
# laid out bottom-to-top, which puts Jan at the top.
calendarMonths = ['Jan','Feb','Mar','Apr','May','Jun',
                  'Jul','Aug','Sep','Oct','Nov','Dec']
fig.update_yaxes(categoryorder = 'array', categoryarray = calendarMonths[::-1], row = 1, col = 2)

fig.update_layout(showlegend = False, template = 'seaborn')
fig.show()

In [None]:
# Count dark-side observations per credited observer and chart them.
observers = darkSide['Credit'].value_counts().reset_index()
observers.columns = ['Observer', 'Number of Observation']

fig = px.bar(
    observers,
    x='Number of Observation',
    y='Observer',
    template='plotly_dark',
    title='Who saw the dark side?',
)
fig.show()

In [None]:
# Build a cleaned text blob from the dark-side descriptions only: lower-case, strip a fixed set
# of punctuation characters, collapse whitespace runs to single spaces.
darkSideText = ' '.join(darkSide['Description'].fillna('')).lower()
darkSideText = re.sub(r'[,.!*):(<]', '', darkSideText)
corpus = re.sub(r'\s+', ' ', darkSideText)

# Draw the word cloud (no mask this time), reusing the stop-word set built earlier in the notebook.
cloudFigure, cloudAxes = plt.subplots(figsize=(12, 8))
wordcloud = WordCloud(
    background_color='Black',
    contour_color='White',
    contour_width=5,
    stopwords=stopwords,
    width=1500,
    height=1080,
    margin=10,
).generate(corpus)
cloudAxes.imshow(wordcloud)
cloudAxes.axis('off')
plt.show()