### Import Modules

In [1]:
import scipy
import sklearn.metrics as metrics
import sklearn.preprocessing as pp
import sklearn.ensemble as ensemble
import sklearn.cluster as cluster
import sklearn.linear_model as linear_model

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

import sklearn.tree as tree

import pandas as pd
import numpy as np


In [2]:
pd.options.display.max_columns = 61

### Load the Data and view

In [3]:
schema = pd.read_csv("Data/Survey_results_schema.csv")

In [4]:
schema.set_index('Column', inplace=True)

In [5]:
schema.loc['NEWOtherComms'].QuestionText

'Are you a member of any other online developer communities?'

In [6]:
df = pd.read_csv("Data/Survey_results_public.csv")

In [7]:
df.set_index("Respondent", inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64461 entries, 1 to 65112
Data columns (total 60 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   MainBranch                    64162 non-null  object 
 1   Hobbyist                      64416 non-null  object 
 2   Age                           45446 non-null  float64
 3   Age1stCode                    57900 non-null  object 
 4   CompFreq                      40069 non-null  object 
 5   CompTotal                     34826 non-null  float64
 6   ConvertedComp                 34756 non-null  float64
 7   Country                       64072 non-null  object 
 8   CurrencyDesc                  45472 non-null  object 
 9   CurrencySymbol                45472 non-null  object 
 10  DatabaseDesireNextYear        44070 non-null  object 
 11  DatabaseWorkedWith            49537 non-null  object 
 12  DevType                       49370 non-null  object 
 13  E

# Data Cleaning

In [9]:
# make hobbyist data frame; drop columns where hobbyists 
hobbyist_df = df.dropna(subset=["Hobbyist"]) 

In [10]:
#see numeric variables
hobbyist_df.describe()

Unnamed: 0,Age,CompTotal,ConvertedComp,WorkWeekHrs
count,45446.0,34826.0,34756.0,41151.0
mean,30.834111,3.190464e+242,103756.1,40.782174
std,9.585392,inf,226885.3,17.816383
min,1.0,0.0,0.0,1.0
25%,24.0,20000.0,24648.0,40.0
50%,29.0,63000.0,54049.0,40.0
75%,35.0,125000.0,95000.0,44.0
max,279.0,1.1111110000000001e+247,2000000.0,475.0


In [11]:
#convert string types that should be numeric to numeric types
to_numeric = ['Age1stCode', 'YearsCode', 'YearsCodePro']

In [12]:
#Warning with replace on one of my columns

In [13]:
hobbyist_df.replace({'Younger than 5 years' : '5', 'Older than 85': '85'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [14]:
hobbyist_df.replace({'Less than 1 year' : '0', 'More than 50 years': '50'}, inplace=True)

In [15]:
hobbyist_df.replace({'Less than 1 year' : '0', 'More than 50 years': '50'}, inplace=True)

## making Numeric columns

In [16]:
hobbyist_df.loc[:,"Age1stCode"] = pd.to_numeric(hobbyist_df.Age1stCode)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [17]:
hobbyist_df.loc[:,"YearsCode"] = pd.to_numeric(hobbyist_df.YearsCode)
hobbyist_df.loc[:,"YearsCodePro"] = pd.to_numeric(hobbyist_df.YearsCodePro)

In [18]:

hobbyist_df.WorkWeekHrs.apply(lambda x: x if x <= 150 else np.nan)

Respondent
1        50.0
2         NaN
3         NaN
4        40.0
5         NaN
         ... 
64858     NaN
64867     NaN
64898     NaN
64925     NaN
65112     NaN
Name: WorkWeekHrs, Length: 64416, dtype: float64

In [19]:
#Create IsNa for all numeric columns

In [20]:
numeric_columns = hobbyist_df.select_dtypes(include=["int", "float"]).columns
numeric_columns

Index(['Age', 'Age1stCode', 'CompTotal', 'ConvertedComp', 'WorkWeekHrs',
       'YearsCode', 'YearsCodePro'],
      dtype='object')

In [21]:
hobbyist_df.WorkWeekHrs.isna()

Respondent
1        False
2         True
3         True
4        False
5         True
         ...  
64858     True
64867     True
64898     True
64925     True
65112     True
Name: WorkWeekHrs, Length: 64416, dtype: bool

In [22]:
def create_isNa(columns, dFrame):
    for i in columns:
        label = i + "_isNaN"
        series = dFrame[i].isna()
        dFrame.loc[:,label] = series.apply(lambda x: 1 if x else 0)
        

In [23]:
create_isNa(numeric_columns, hobbyist_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)


### Creating Dummy Variables out of Categorical Columns

In [24]:
cat_columns = hobbyist_df.select_dtypes(include=["object"]).columns
cat_columns

Index(['MainBranch', 'Hobbyist', 'CompFreq', 'Country', 'CurrencyDesc',
       'CurrencySymbol', 'DatabaseDesireNextYear', 'DatabaseWorkedWith',
       'DevType', 'EdLevel', 'Employment', 'Ethnicity', 'Gender', 'JobFactors',
       'JobSat', 'JobSeek', 'LanguageDesireNextYear', 'LanguageWorkedWith',
       'MiscTechDesireNextYear', 'MiscTechWorkedWith',
       'NEWCollabToolsDesireNextYear', 'NEWCollabToolsWorkedWith', 'NEWDevOps',
       'NEWDevOpsImpt', 'NEWEdImpt', 'NEWJobHunt', 'NEWJobHuntResearch',
       'NEWLearn', 'NEWOffTopic', 'NEWOnboardGood', 'NEWOtherComms',
       'NEWOvertime', 'NEWPurchaseResearch', 'NEWPurpleLink', 'NEWSOSites',
       'NEWStuck', 'OpSys', 'OrgSize', 'PlatformDesireNextYear',
       'PlatformWorkedWith', 'PurchaseWhat', 'Sexuality', 'SOAccount',
       'SOComm', 'SOPartFreq', 'SOVisitFreq', 'SurveyEase', 'SurveyLength',
       'Trans', 'UndergradMajor', 'WebframeDesireNextYear',
       'WebframeWorkedWith', 'WelcomeChange'],
      dtype='object')

### Find categorical columns with more than X categories

In [25]:
def find_many_categories(columns, df, minimum, critical):
    
    #make dictionary with the usefull columns and the not usefull columns
    useful_columns = dict()
    not_useful_columns = dict()
    
    for i in columns:
        
        count = len(df[i].unique())
        if minimum:
            if count > critical:
                useful_columns[i] = count
            else:
                not_useful_columns[i] = count
                
                
        else:
            if count < critical:
                useful_columns[i] = count
            else:
                not_useful_columns[i] = count
    
    
    useful_df = df.loc[:,useful_columns.keys()]
    non_useful_df = df.loc[:,not_useful_columns.keys()]
        
    
    return (useful_df, non_useful_df)

In [26]:
hobbyist_ready_df, hobbyist_not_ready = find_many_categories(cat_columns, hobbyist_df, False, 8)
hobbyist_ready_df = find_many_categories(cat_columns, hobbyist_df, False, 8)[0]

In [27]:
hobbyist_df_ready = pd.concat([hobbyist_ready_df, hobbyist_df.select_dtypes(include=["int", "float"])], axis=1)

### Lets Get the variables in hobbyist_not_ready prepared for get_dummies method

In [71]:
hobbyist_not_ready.drop(columns=["CurrencyDesc", "CurrencySymbol"], inplace=True)

KeyError: "['CurrencyDesc' 'CurrencySymbol'] not found in axis"

In [78]:

list_cols = []
non_null = hobbyist_not_ready.dropna()

for i in hobbyist_not_ready.columns:
    pass
    #grab first nonNull entry
    if ";" in non_null.loc[10,i]:
        list_cols.append(i)

for i in list_cols:
    hobbyist_not_ready.loc[:,i] = hobbyist_not_ready[i].str.split(";")

dictionary = hobbyist_not_ready.to_dict()

def get_unique(diction: dict) -> list:
    unique_list = set()
    for i,v in diction.items():
        if type(v) == list:
            for j in v:
                unique_list.add(j)
        else:
            print("na")
    return list(unique_list)

#Loop through list of columns with ; dilimeter
unique_values_dict = dict()
for i in list_cols: 
    unique_values_dict[i] = get_unique(dictionary[i])
print("hidden cell")

hidden cell


In [69]:
len(unique_values_dict[list_cols[14]])

16

In [33]:
x = pd.get_dummies(hobbyist_df_ready.drop(columns="Hobbyist"), drop_first=True)
y = hobbyist_df_ready["Hobbyist"]

### Feature Selection to choose which Variables we want to explore

In [34]:
from sklearn.impute import SimpleImputer

In [35]:
# lets use select K best to find which features we want to choose
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.pipeline import make_pipeline

In [36]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [37]:
pipeline = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='mean'),
    pp.StandardScaler())

In [38]:
x_new = pipeline.fit_transform(x_train)

  sqr = np.multiply(arr, arr, out=arr)


In [39]:
x_new.shape

(48312, 83)

In [40]:
feature_selection = SelectKBest(f_classif, k=15)

In [41]:
x_feature_selection = feature_selection.fit(x_new, y_train)

  f = msb / msw


In [42]:
transformed = feature_selection.transform(x_new)

In [43]:
K_best_list = list(x_feature_selection.scores_)
K_best_array = x_feature_selection.scores_
print(len(K_best_list))

83


In [44]:
#Make a data frame with the score in a column called s_classif

In [45]:
K_best_s = pd.Series(K_best_array)

In [46]:
cols = pd.Series(x.columns)

In [47]:
F_classif_df = pd.concat([cols,K_best_s], axis=1)

In [48]:
F_classif_df.columns = ['Column_name', 'f_score']

In [49]:
F_classif_df.sort_values(by="f_score", ascending=False).head(15)

Unnamed: 0,Column_name,f_score
1,Age1stCode,596.705294
43,NEWOtherComms_Yes,519.760725
16,MainBranch_I code primarily as a hobby,516.763423
38,NEWLearn_Once every few years,515.260587
0,Age,258.039551
54,PurchaseWhat_I have little or no influence,209.474271
13,YearsCodePro_isNaN,181.409808
51,OpSys_Linux-based,169.308702
36,NEWLearn_Once a decade,157.351017
57,SOAccount_Yes,141.835533


### EDA
