In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
import datetime
from ipyleaflet import Map, basemaps, Marker, basemap_to_tiles, Circle, Polyline
from eda.data_loaders.csv_loader import CSVDataLoader
from eda.data_analysis.data_analysis import DataAnalysis
from eda.data_cleaning.data_cleaner import DataCleaning

# load the NASA facilities (nasadf) dataset from the project's datasets directory
loader = CSVDataLoader(datasets_dir='datasets')
nasadf = loader.load_data("NASA_Facilities.csv")

# construct the project's analysis helper around the frame (used below via analysis.initial())
analysis = DataAnalysis(nasadf)

# construct the project's cleaning helper around the same frame
# NOTE(review): `cleaning` is not used by any cell visible here - confirm it is still needed
cleaning = DataCleaning(nasadf)

In [None]:
# First look at the value distributions: one 30-bin histogram per column that pandas can plot
nasadf.hist(bins=30, figsize=(15,9), color='g')
plt.show()

In [2]:
# Run the project helper's initial overview; the captured output below begins with head(10)
analysis.initial()

self.df.head(10)
                         Center Center Search Status  \
0          Kennedy Space Center               Public   
1       Langley Research Center               Public   
2          Kennedy Space Center               Public   
3  Marshall Space Flight Center               Public   
4  Marshall Space Flight Center               Public   
5          Kennedy Space Center               Public   
6          Kennedy Space Center               Public   
7            Jet Propulsion Lab               Public   
8            Jet Propulsion Lab               Public   
9          Kennedy Space Center               Public   

                                           Facility  FacilityURL  Occupied  \
0                        Control Room 2/1726/HGR-S           NaN    1957.0   
1             Micometeroid/LDEF Analysis Laboratory          NaN    1965.0   
2     SRM Rotation and Processing Facility/K6-0494           NaN    1984.0   
3        ET WTC - 14-Inch Trisonic Wind Tunnel 4732   

In [None]:
# Keep only rows whose Status is "Active" and that have non-null
# "URL Link" and "Occupied" values.
is_active = nasadf["Status"] == "Active"
has_url = nasadf["URL Link"].notna()
has_occupied = nasadf["Occupied"].notna()
working_df = nasadf[is_active & has_url & has_occupied]

In [None]:
# Unpack/Extract Latitude and Longitude from the Location column.
# str.extract returns text, so cast to float to get numeric coordinates
# (rows where the pattern does not match stay NaN).
regex_pattern = r"(?P<Latitude>-?\d+\.\d+), (?P<Longitude>-?\d+\.\d+)\)"
df_unpacked = working_df["Location"].str.extract(regex_pattern).astype(float)

# Merge the unpacked columns back in the main dataframe.
# Any Latitude/Longitude columns left by a previous run of this cell are
# dropped first, so re-running does not append duplicate column names
# (duplicates would make loc['Latitude'] return a Series instead of a scalar).
# Rows of nasadf that are not in working_df get NaN coordinates.
nasadf = pd.concat(
    [nasadf.drop(columns=df_unpacked.columns, errors="ignore"), df_unpacked],
    axis=1,
)

In [None]:
# Example map centre: a point between Colorado Springs and Kansas City
CENTER_LOC = (39.0119, -98.4842)

# Build the topographic map, mark its centre, then render it in the cell output
MAP = Map(basemap=basemaps.OpenTopoMap, center=CENTER_LOC, zoom=3)
center_marker = Marker(location=CENTER_LOC)
MAP.add_layer(center_marker)
display(MAP)

In [None]:
def draw_marker_on_map(amap, location):
    """Add a marker layer at ``location`` to ``amap``.

    Args:
        amap: Object exposing ``add_layer``; the notebook passes its
            ipyleaflet ``Map``.
        location: Value forwarded to ``Marker(location=...)``; the notebook
            passes ``(latitude, longitude)`` tuples.
    """
    marker = Marker(location=location)
    amap.add_layer(marker)

In [None]:
# Draw a marker for each NASA facility that has usable coordinates.
# Coordinates are coerced to numbers (the regex extraction may have produced
# text) and rows without a latitude/longitude are skipped, so no marker is
# created with a NaN location. A plain loop replaces DataFrame.apply because
# the call is made only for its side effect on MAP.
facility_coords = (
    nasadf[["Latitude", "Longitude"]]
    .apply(pd.to_numeric, errors="coerce")
    .dropna()
)
for latitude, longitude in facility_coords.itertuples(index=False, name=None):
    draw_marker_on_map(MAP, (latitude, longitude))

In [None]:
# Display the frame to inspect the current columns and values
nasadf

In [None]:
# Number of rows (facilities) per Center; shown as the cell output
center_counts = nasadf.groupby('Center').size()
center_counts

In [None]:
# Bar chart of facility counts per center, configured through the Axes object
center_ax = center_counts.plot.bar(figsize=(10, 6))
center_ax.set_title('Number of Facilities by Center')
center_ax.set_ylabel('Number of Facilities')
center_ax.set_xlabel('Center')
# Tilt the center names so the labels are easier to read
plt.setp(center_ax.get_xticklabels(), rotation=45)
center_ax.figure.tight_layout()
plt.show()

In [None]:
# Remove the "Center Search Status" column. Rebinding the result (instead of
# inplace=True) together with errors="ignore" keeps this cell safe to re-run:
# a second execution is a no-op rather than a KeyError.
nasadf = nasadf.drop(columns="Center Search Status", errors="ignore")

In [None]:
# Display the frame again to inspect its current columns
nasadf

In [None]:
# Count the rows whose Status is missing
nasadf['Status'].isna().sum()

In [None]:
# Label missing statuses as 'Unknown'. Assign the result back to the column:
# calling fillna(inplace=True) on a column selection is chained assignment,
# which pandas warns about and which does not update the frame under
# Copy-on-Write.
nasadf['Status'] = nasadf['Status'].fillna('Unknown')

In [None]:
# Number of facilities per Status value (plotted in the next cell)
status_counts = nasadf['Status'].value_counts()

In [None]:
# Horizontal bar chart of how many facilities fall under each status
status_ax = status_counts.plot.barh()
status_ax.set_title('Distribution of Facility Status')
status_ax.set_xlabel('Number of Facilities')
plt.show()

In [None]:
# Inspect the raw 'Record Date' values and how often each occurs
nasadf['Record Date'].value_counts()

In [None]:
# Parse both timestamp columns with the same 12-hour format;
# values that do not match the format become NaT (errors='coerce').
TIMESTAMP_FORMAT = '%m/%d/%Y %I:%M:%S %p'
for date_column in ('Record Date', 'Last Update'):
    nasadf[date_column] = pd.to_datetime(
        nasadf[date_column], format=TIMESTAMP_FORMAT, errors='coerce'
    )

In [None]:
# Order the rows by 'Record Date' (ascending, the sort_values default)
nasadf = nasadf.sort_values(by='Record Date')

In [None]:
# Calendar year of each 'Record Date'
# NOTE(review): dates that failed to parse presumably yield a missing Year and
# drop out of the groupby below (groupby excludes NaN keys by default) - confirm
nasadf['Year'] = nasadf['Record Date'].dt.year
# Group by Year and count the number of facilities
yearly_counts = nasadf.groupby('Year').size()
# Show the year labels; the plotting cell casts this index to int
yearly_counts.index

In [None]:
# Use integer year labels on the index so the x values are whole years
yearly_counts.index = yearly_counts.index.astype(int)

# Scatter plot of the number of facilities recorded per year
fig, ax = plt.subplots(figsize=(11, 5))
# s controls the marker size
ax.scatter(yearly_counts.index, yearly_counts.values, color='blue', s=100)
ax.set_xlabel('Year')
ax.set_ylabel('Number of Facilities')
ax.set_title('Number of Facilities Recorded Each Year')

# Restrict the x-axis ticks to integer positions.
# https://matplotlib.org/stable/gallery/ticks/tick_labels_from_values.html#setting-tick-labels-from-a-list-of-values
ax.xaxis.set_major_locator(MaxNLocator(integer=True))

ax.grid(True, which='both', linestyle='--', linewidth=0.5)
fig.tight_layout()
plt.show()

In [None]:
# Number of facilities per State (value_counts orders by descending count)
state_counts = nasadf['State'].value_counts()

In [None]:
# Show the per-state counts
state_counts

In [None]:
# Bar plot of facility counts per state, drawn on an explicit Axes
state_fig, state_ax = plt.subplots(figsize=(15, 8))
sns.barplot(x=state_counts.index, y=state_counts.values, palette='viridis', ax=state_ax)
# Tilt the state labels so neighbouring ticks do not collide
plt.setp(state_ax.get_xticklabels(), rotation=45)
state_ax.set_title('Number of Facilities in Each State')
state_ax.set_ylabel('Number of Facilities')
state_ax.set_xlabel('State')
state_fig.tight_layout()
plt.show()

In [None]:
# Count how many facility rows each 'Contact' value appears on
# (value_counts orders the result by descending count)
contact_counts = nasadf['Contact'].value_counts()

In [None]:
# Show the per-contact counts
contact_counts

In [None]:
# Horizontal bar plot: one bar per contact, length = number of facilities
contact_fig, contact_ax = plt.subplots(figsize=(15, 8))
sns.barplot(x=contact_counts.values, y=contact_counts.index, orient='h', ax=contact_ax)
contact_ax.set_xlabel('Number of Facilities')
contact_ax.set_ylabel('Contact')
contact_ax.set_title('Number of Facilities per Contact')
plt.show()

In [None]:
# Select the 'Occupied' (establishment year) values once per status group
active_years = nasadf.loc[nasadf['Status'] == 'Active', 'Occupied']
inactive_years = nasadf.loc[nasadf['Status'] == 'Inactive', 'Occupied']

# Mean and median establishment year for each group
active_mean_year = active_years.mean()
active_median_year = active_years.median()
inactive_mean_year = inactive_years.mean()
inactive_median_year = inactive_years.median()

print("For Active Facilities:")
print(f"Mean Establishment Year: {active_mean_year:.2f}")
print(f"Median Establishment Year: {active_median_year}")

print("\nFor Inactive Facilities:")
print(f"Mean Establishment Year: {inactive_mean_year:.2f}")
print(f"Median Establishment Year: {inactive_median_year}")

In [None]:
# Keep only rows with a known 'Occupied' value and store it as an integer.
# dropna() returns a new frame and astype() is applied to that result, so the
# cast never assigns into a filtered slice of nasadf (the slice-then-assign
# pattern raises SettingWithCopyWarning and may not update what you expect).
nasadf_filtered = nasadf.dropna(subset=['Occupied']).astype({'Occupied': int})

# Create a boxplot of establishment year for each facility status
plt.figure(figsize=(10, 6))
sns.boxplot(data=nasadf_filtered, x='Status', y='Occupied')
plt.title("Distribution of Establishment Years by Status")
plt.ylabel("Establishment Year")
plt.xlabel("Facility Status")
plt.show()

In [None]:
# Determine which centers have the oldest facilities, ranked by median age
# Calculate the current year
current_year = datetime.datetime.now().year

# Calculate age for each facility (NaN where 'Occupied' is missing)
nasadf['Age'] = current_year - nasadf['Occupied']

# Group by center and take the median age, oldest first.
# median() replaces quantile('.50'): the quantile must be a number, and the
# string form only works where pandas happens to coerce it to a float.
# NOTE: the variable keeps its original name even though the statistic is a median.
average_ages = nasadf.groupby('Center')['Age'].median().sort_values(ascending=False)
print(average_ages.dropna())