In [69]:
import pandas as pd
import numpy as np
from plotly import graph_objects as go
from plotly import express as px
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import warnings
# NOTE(review): this suppresses every warning category for the whole session,
# including scikit-learn's ConvergenceWarning, which LogisticRegression emits
# when the solver stops before converging. Consider narrowing the filter to the
# specific warnings that are known to be noise.
warnings.filterwarnings('ignore')


# What Makes a Hobbyist Developer

DataCamp.com is an online learning platform that teaches data science, data analytics, and data engineering. They have found that hobbyist developers are more likely to sign up for paid versions of their account. Their marketing team would like to know how to get more conversions from their campaigns and finds it practical to target hobbyist developers. It's the job of the Data Science team to figure out the key indicators that a developer is a hobbyist. In this notebook I will go through what I have learned about EDA throughout my time as a data scientist. I will cover not only how to properly perform exploratory data analysis, but also tips and tricks that I have picked up that can help improve your EDA process.

## The first tip is that the best exploratory data analysis uses a model as a guide.

In [70]:
# Load the pre-cleaned dataset; the path is relative to the notebook's working directory.
CLEANED_CSV = 'data/cleaned.csv'
df = pd.read_csv(CLEANED_CSV)

In [71]:
# Encode the target as 1 (hobbyist) / 0 (not a hobbyist).
# Series.map turns any value that is missing from the dict into NaN, so the
# identity entries for 1 and 0 keep this cell safe to re-run: without them a
# second execution would replace the already-encoded column with NaN.
HOBBYIST_CODES = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
df['Hobbyist'] = df['Hobbyist'].map(HOBBYIST_CODES)

In [72]:
def filter_large(x, threshold=10**9):
    """Replace an implausibly large value with NaN.

    Parameters
    ----------
    x : number
        Value to check. A NaN input is returned unchanged, because
        ``NaN > threshold`` evaluates to False.
    threshold : number, default 10**9
        Largest value that is still accepted. Anything strictly greater is
        treated as missing.

    Returns
    -------
    ``np.nan`` when ``x > threshold``, otherwise ``x`` itself.
    """
    return np.nan if x > threshold else x

In [73]:
# Blank out implausibly large compensation values; filter_large is applied
# element by element, so it can be passed directly without a lambda wrapper.
df['CompTotal'] = df['CompTotal'].apply(filter_large)

In [74]:
# Separate the target column from the predictors.
y = df['Hobbyist']
X = df.drop(columns='Hobbyist')

In [75]:
# Hold out 25% of the rows; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [76]:
# Impute missing values (SimpleImputer defaults to the column mean), standardize
# every feature, then fit a logistic regression on the scaled inputs.
model_steps = [
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression()),
]
pipeline = Pipeline(steps=model_steps)

In [77]:
# Fit all pipeline steps on the training split only; the result is bound to `_`
# so the fitted pipeline's repr is not echoed as cell output.
_ = pipeline.fit(X=x_train, y=y_train)

In [78]:
# Pair every input column with its logistic-regression coefficient.
coefficients = pipeline['classifier'].coef_.ravel()

# zip() stops at the shorter input without complaint. If the number of
# coefficients differs from the number of columns (for example because
# SimpleImputer discarded a column that was entirely missing during fit),
# names and coefficients would no longer line up, so fail loudly instead.
if len(coefficients) != len(x_train.columns):
    raise ValueError(
        f"{len(coefficients)} coefficients for {len(x_train.columns)} columns; "
        "feature names no longer line up with the fitted model"
    )

feature_importances = list(zip(x_train.columns, coefficients))

In [79]:
# Rank (feature, coefficient) pairs by coefficient magnitude, largest first.
# The signed coefficient stays in the tuple so the direction is still visible.
sorted_importances = list(feature_importances)
sorted_importances.sort(key=lambda pair: np.abs(pair[1]), reverse=True)

In [81]:
# Keep only the leading entries of the ranked list.
TOP_N = 10
top10 = sorted_importances[:TOP_N]

In [82]:
# Display the ten (feature, coefficient) pairs with the largest absolute
# coefficients. The features were standardized before fitting, and the target
# was encoded with 'Yes' -> 1, so a positive coefficient means higher values of
# that feature raise the predicted odds of being a hobbyist.
top10

[('YearsCodePro_isNaN', 0.3319055534725019),
 ('CompTotal_isNaN', 0.22271584803196362),
 ('Gender_is_Man', 0.21952824062415005),
 ('Age1stCode_isNaN', -0.21829288407836148),
 ('NEWLearn_Once every few years', -0.20903042189701704),
 ('YearsCodePro', -0.20113141276152552),
 ('ConvertedComp_isNaN', -0.19889551166545252),
 ('JobSeek_isNaN', 0.19104640575244272),
 ('LanguageWorkedWith_is_Rust', 0.18753355893358706),
 ('NEWOtherComms_Yes', 0.18076449684911963)]