# Potential Talents - An Apziva Project (#3)

# EDA

By Samuel Alter

Apziva: 6bImatZVlK6DnbEo

## Project Overview

### Goals

### The Dataset

## Imports and Helper Functions

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import joblib
import time
from datetime import datetime
import json
from pathlib import Path
import inspect
import re
import string
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import geopandas as gpd
import duckdb as dd

In [None]:
import nltk
# fetch the NLTK resources used by the tools imported below;
# `nltk.download` is a no-op for resources that are already up to date
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords') # needed by `nltk.corpus.stopwords`, imported below
from nltk.corpus import stopwords # lists of stopwords
from nltk.tokenize import word_tokenize # tool for splitting documents into tokens
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer # basic stemmer
from nltk.stem import WordNetLemmatizer # more sophisticated word->lemma
from nltk.corpus import wordnet

In [None]:
# simple function to generate random integers

def rand_gen(low=1,high=1e4):
    '''
    Generates a pseudo-random integer in the half-open range [low, high).

    With the default bounds the result has up to four digits (1 to 9999).

    Parameters:
    -----------
    low: int - Smallest value that can be returned (inclusive).
    high: int - Upper bound (exclusive). Integer-valued floats such as
        the default 1e4 are accepted and converted with int().

    Returns:
    --------
    int - A plain Python integer, e.g. for use as a random seed.
    '''
    # convert the bounds explicitly so float arguments (like the default
    # 1e4) do not depend on numpy's implicit float-to-int handling
    low,high=int(low),int(high)

    # unseeded generator: every call draws independently of earlier calls
    rng=np.random.default_rng()

    # cast the numpy integer to a built-in int before returning
    return int(rng.integers(low=low,high=high))

In [None]:
# draw a fresh seed with `rand_gen` and echo it as the cell output
seed=rand_gen()
seed

In [None]:
# set the randomness seed throughout the notebook
# source: https://odsc.medium.com/properly-setting-the-random-seed-in-ml-experiments-not-as-simple-as-you-might-imagine-219969c84752

## set `PYTHONHASHSEED` environment variable at a fixed value
## NOTE: Python reads this variable when the interpreter starts, so setting it
## here only reaches processes launched afterwards, not this running kernel
import os
os.environ['PYTHONHASHSEED']=str(seed)
## set `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed)
## set `numpy` legacy global pseudo-random generator at a fixed value
np.random.seed(seed)
## create a seeded `numpy` Generator and keep a reference to it:
## `default_rng` returns a new generator and does not seed any global state,
## so draws are only reproducible when they are made from `rng`
rng=np.random.default_rng(seed)
rng

In [None]:
def get_variable_name(var):
    '''
    Return the names bound to `var` in the caller's local namespace.

    Matching is by identity (`is`), so every caller-side name that refers
    to the very same object is returned; the list is empty if none does.
    '''
    # f_back is the frame of whoever called this function
    caller_namespace = inspect.currentframe().f_back.f_locals

    matches = []
    for label, bound in caller_namespace.items():
        if bound is var:
            matches.append(label)
    return matches

def fileDaterSaver(location: str,
                   filetype: str,
                   object_,
                   extra: str = '',
                   verbose: bool = True):

    '''
    Function that gets a timestamped filename and saves an object
    to a user-specified location.

    The file is named `[<extra>_]<YYYY-MM-DD_HH-MM-SS>.<filetype>`.

    Parameters:
    -----------
    location: str - The location where the file will be saved. If it is
        an existing directory the file is placed inside it (a trailing
        separator is optional); otherwise the string is used as a plain
        prefix of the filename.
    filetype: str - The type of the file to save ('csv' or 'json').
    object_: The object to be saved. Should be a pandas DataFrame
        for 'csv' or serializable for 'json' (values json cannot encode
        are converted with str()).
    extra: str - Additional string to include in the filename.
    verbose: bool - Whether to print the timestamp and the target path.
        The final success/failure message is always printed.

    Raises:
    -------
    ValueError - If `filetype` is not 'csv' or 'json'. Raised before
        anything is printed or written.
    '''

    # reject unsupported types before printing or touching the disk
    if filetype not in ('csv', 'json'):
        raise ValueError("Unsupported file type. Use 'csv' or 'json'.")

    # get current date and time
    current_datetime = datetime.now()

    # print current date and time to check
    if verbose:
        print('current_datetime:', current_datetime)

    # format the datetime for a filename
    datetime_suffix = current_datetime.strftime("%Y-%m-%d_%H-%M-%S")

    # create filename with the datetime suffix
    if extra != '':
        base_name = f'{extra}_{datetime_suffix}.{filetype}'
    else:
        base_name = f'{datetime_suffix}.{filetype}'

    # an existing directory gets a proper path join, so a missing trailing
    # slash no longer glues the filename onto the directory name; anything
    # else keeps the original plain concatenation
    if location and Path(location).is_dir():
        file_path = Path(location) / base_name
    else:
        file_path = Path(f'{location}{base_name}')

    # print file name
    if verbose:
        print(file_path)

    # save object
    if filetype == 'csv':
        object_.to_csv(file_path, index=True)
    else:
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(json.dumps(object_, default=str))

    # confirm save
    if not file_path.exists():
        print("File save error.")
        return

    # look the object up in the CALLER's namespace: inside this function it
    # is always bound to `object_`, which is not the name the user gave it
    own_frame = inspect.currentframe()
    caller_frame = own_frame.f_back if own_frame is not None else None
    caller_names = caller_frame.f_locals if caller_frame is not None else {}
    # skip underscore-prefixed names such as the interactive `_` result alias
    variable_name = [name for name, val in list(caller_names.items())
                     if val is object_ and not name.startswith('_')]

    if variable_name:
        print(f'Successfully saved {variable_name[0]} to {file_path}')
    else:
        print(f'Successfully saved object to {file_path}')

Read in the dataset:

In [None]:
# load the raw CSV (relative path) and preview the first 3 rows
df=pd.read_csv('../data/3_data.csv')
df.head(3)

## Analysis

In [None]:
# print each column's dtype and non-null count
df.info()

No nulls in the dataset. That is nice for us! There are 104 total observations.

In [None]:
# tabulate how many times each distinct value of `fit` appears
df['fit'].value_counts()

We'll remove `fit` as it is a column with no data.

In [None]:
# drop `fit` in place; `errors='ignore'` makes this a no-op when the
# column is already gone, so the cell can be re-run safely
df.drop(columns=['fit'],errors='ignore',inplace=True)

df.head()

Inspect the connections column:

In [None]:
# frequency of each raw `connection` value
connections=df['connection'].value_counts()

connections

I will change "500+" to 500 so that the column can be stored as a numeric value.

In [None]:
# strip every non-digit character from the column, then cast it to numeric
column='connection'
if pd.api.types.is_numeric_dtype(df[column]):
    # already numeric (e.g. the cell was run before): nothing to clean
    print(f'There are no non-numeric characters in the column: {column}')
else:
    digits_only=df[column].str.replace(r'\D','',regex=True)
    df[column]=digits_only

    # sanity check: no '+' should survive the replacement above
    plus_count=df[column].str.contains(r'\+').sum()
    print("Amount of '+' in column:",plus_count)

    # cast to a numeric dtype; values that cannot be parsed become NaN
    df[column]=pd.to_numeric(df[column],errors='coerce')
    print(df.dtypes)

    # report whether the cast produced a numeric column
    print('\nWas conversion successful?')
    print('Yes.' if pd.api.types.is_numeric_dtype(df[column]) else 'No.')

### Histogram of Connections

In [None]:
# keep only the observations with fewer than 500 connections
under_500=df['connection']<500
df_no500=df[under_500]

# check: three most frequent remaining connection counts
df_no500['connection'].value_counts().head(3)

In [None]:
# count the observations left out of the plot (those not below 500), so the
# note in the title always matches the data instead of a hard-coded number
n_not_shown=int((df['connection']>=500).sum())

# make sure the output folder exists before saving into it
Path('figures').mkdir(parents=True,exist_ok=True)

plt.figure(figsize=(8,6))
plt.hist(x=df_no500['connection'],
         color='cornflowerblue',
         bins=20)
plt.title(f'Histogram of Connections\n\nNote:\nThose with greater than 500 connections are not shown\nThere are {n_not_shown} observations with >500 connections')
plt.xlabel('Number of Connections')
plt.ylabel('Count')
plt.grid(which='both',axis='y')
plt.xticks(range(0,501,50)) # get xticks to appear every 50 connections

# save the same figure in each output format
for extension in ('pdf','jpg','png'):
    plt.savefig(f'figures/histogram_connections.{extension}')

plt.show()

### Boxplot of Connections

In [None]:
# boxplot of `connection` over every observation in `df`
plt.figure(figsize=(6,2))
sns.boxplot(data=df,x='connection',color='cornflowerblue')
plt.title('Boxplot of Connections\nIncluding those with >500 Connections')
plt.xlabel('Connection')
plt.xticks(range(0,501,50)) # one tick every 50 connections

# write the figure once per output format
for target in ('figures/boxplot_connections.pdf',
               'figures/boxplot_connections.jpg',
               'figures/boxplot_connections.png'):
    plt.savefig(target)

plt.show()

In [None]:
# boxplot of `connection` restricted to the rows kept in `df_no500`
plt.figure(figsize=(6,2))
sns.boxplot(data=df_no500,x='connection',color='cornflowerblue')
plt.title('Boxplot of Connections\nNot including those with >500 Connections')
plt.xlabel('Connection')
plt.xticks(range(0,501,50)) # one tick every 50 connections

# write the figure once per output format
for target in ('figures/boxplot_no500.pdf',
               'figures/boxplot_no500.jpg',
               'figures/boxplot_no500.png'):
    plt.savefig(target)

plt.show()

### Map of Observations

In [None]:
# frequency of each raw location string
df['location'].value_counts()

In [None]:
# number of unique locations in dataset
# (distinct raw strings, before any name cleaning)
df['location'].nunique()

This is not terrible. I'd like to get the centroids for each municipality to create a choropleth map of the locations.

First step is to clean this column to make it easier to get the centroids. We won't go fully intense with the geospatial information, so if the city says "Greater CITY Area," I'll just make that the CITY to simplify things.

In [None]:
# Maps every raw `location` string to a simplified place name.
# The original literal was missing its commas and most of its values, so it
# did not parse. Every original key is kept, so mapping the column through
# this dict leaves no location unmapped.
# "Greater CITY Area" / "CITY, STATE Area" entries collapse to the city itself.
# NOTE(review): the target names are a best-effort "City, State" normalization;
# confirm they match the naming used in the city-boundary data before joining.
city_names_map={
    'Kanada':'Canada',
    # NOTE(review): two-city metro areas are mapped to their first-named city
    'Raleigh-Durham, North Carolina Area':'Raleigh, North Carolina',
    'Houston, Texas Area':'Houston, Texas',
    'Greater New York City Area':'New York, New York',
    'Houston, Texas':'Houston, Texas',
    'Denton, Texas':'Denton, Texas',
    'San Francisco Bay Area':'San Francisco, California',
    'Greater Philadelphia Area':'Philadelphia, Pennsylvania',
    'İzmir, Türkiye':'Izmir, Turkey',
    'Lake Forest, California':'Lake Forest, California',
    'Atlanta, Georgia':'Atlanta, Georgia',
    'Chicago, Illinois':'Chicago, Illinois',
    'Austin, Texas Area':'Austin, Texas',
    'Greater Atlanta Area':'Atlanta, Georgia',
    'Amerika Birleşik Devletleri':'United States',
    'Long Beach, California':'Long Beach, California',
    'Milpitas, California':'Milpitas, California',
    'Greater Chicago Area':'Chicago, Illinois',
    'Torrance, California':'Torrance, California',
    'Greater Los Angeles Area':'Los Angeles, California',
    'Bridgewater, Massachusetts':'Bridgewater, Massachusetts',
    'Lafayette, Indiana':'Lafayette, Indiana',
    'Kokomo, Indiana Area':'Kokomo, Indiana',
    'Las Vegas, Nevada Area':'Las Vegas, Nevada',
    'Cape Girardeau, Missouri':'Cape Girardeau, Missouri',
    'Gaithersburg, Maryland':'Gaithersburg, Maryland',
    'Baltimore, Maryland':'Baltimore, Maryland',
    'Dallas/Fort Worth Area':'Dallas, Texas',
    'Highland, California':'Highland, California',
    'Los Angeles, California':'Los Angeles, California',
    'Chattanooga, Tennessee Area':'Chattanooga, Tennessee',
    'Myrtle Beach, South Carolina Area':'Myrtle Beach, South Carolina',
    'Baton Rouge, Louisiana Area':'Baton Rouge, Louisiana',
    'New York, New York':'New York, New York',
    'San Jose, California':'San Jose, California',
    'Greater Boston Area':'Boston, Massachusetts',
    'Monroe, Louisiana Area':'Monroe, Louisiana',
    'Virginia Beach, Virginia':'Virginia Beach, Virginia',
    'Greater Grand Rapids, Michigan Area':'Grand Rapids, Michigan',
    'Jackson, Mississippi Area':'Jackson, Mississippi',
    'Katy, Texas':'Katy, Texas',
}

Then we'll apply the city boundary data to each city, using the data from [this repo](https://github.com/drei01/geojson-world-cities/tree/master).

In [None]:
# install and load spatial extension
dd.execute('INSTALL spatial')
dd.execute('LOAD spatial')

# load cities geojson
rel=dd.read_json('../data/cities.geojson')

# show schema of the "rel" relation
# the query text names `rel` literally -- presumably DuckDB resolves it to the
# Python variable above, so renaming that variable means editing the SQL too
dd.sql('summarize rel').select('column_name','column_type').show()

In [None]:
# how many rows have strictly fewer than 150 connections
# (non-null count of the first column among the matching rows)
df.loc[df['connection'].lt(150)].count().iat[0]

In [None]:
# how many rows have strictly more than 150 connections
# (non-null count of the first column among the matching rows)
df.loc[df['connection'].gt(150)].count().iat[0]

In [None]:
# column totals for the rows with fewer than 150 connections;
# restricted to numeric columns because a plain `.sum()` would also
# "add up" the text columns by concatenating their strings
print(f"{df[df['connection']<150].sum(numeric_only=True)}")

Observations with 500+ connections form the single largest group (44 of the 104 observations, per the counts above), though they are not a majority.