In [1]:
# import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
# read the data into pandas data frames
df_feat = pd.read_csv('train_features_2013-03-07.csv',sep=',')
df_target = pd.read_csv('train_salaries_2013-03-07.csv',sep=',')

In [3]:
df_feat.head()

Unnamed: 0,jobId,companyId,jobType,degree,major,industry,yearsExperience,milesFromMetropolis
0,JOB1362684407687,COMP37,CFO,MASTERS,MATH,HEALTH,10,83
1,JOB1362684407688,COMP19,CEO,HIGH_SCHOOL,NONE,WEB,3,73
2,JOB1362684407689,COMP52,VICE_PRESIDENT,DOCTORAL,PHYSICS,HEALTH,10,38
3,JOB1362684407690,COMP38,MANAGER,DOCTORAL,CHEMISTRY,AUTO,8,17
4,JOB1362684407691,COMP7,VICE_PRESIDENT,BACHELORS,PHYSICS,FINANCE,8,16


In [4]:
df_target.head()

Unnamed: 0,jobId,salary
0,JOB1362684407687,130
1,JOB1362684407688,101
2,JOB1362684407689,137
3,JOB1362684407690,142
4,JOB1362684407691,163


In [5]:
print df_feat.shape
print df_target.shape

(1000000, 8)
(1000000, 2)


In [7]:
# Join the features and the target data frames
df = df_feat.join(df_target,lsuffix='_feat',rsuffix='_target')

In [8]:
df.shape

(1000000, 10)

In [9]:
df.describe()

Unnamed: 0,yearsExperience,milesFromMetropolis,salary
count,1000000.0,1000000.0,1000000.0
mean,11.992386,49.52926,116.061818
std,7.212391,28.877733,38.717936
min,0.0,0.0,0.0
25%,6.0,25.0,88.0
50%,12.0,50.0,114.0
75%,18.0,75.0,141.0
max,24.0,99.0,301.0


In [32]:
# Plotting the histogram for 'salary'
fig = plt.figure(figsize=(12,12))
ax = sns.distplot(df['salary'])
plt.savefig('salary_hist.png')

In [10]:
# Drop redundant column
df.drop(['jobId_target'],axis=1,inplace=True)
# Rename the column
df.columns = [u'jobId', u'companyId', u'jobType', u'degree', u'major',
       u'industry', u'yearsExperience', u'milesFromMetropolis', u'salary']

In [71]:
# Plotting the histogram for 'yearsExperience'
fig = plt.figure(figsize=(12,12))
ax = sns.distplot(df['yearsExperience'])
plt.savefig('yearsExperience_hist.png')

# Plotting the histogram for 'milesFromMetropolis'
fig = plt.figure(figsize=(12,12))
ax = sns.distplot(df['milesFromMetropolis'])
plt.savefig('milesFromMetropolis_hist.png')

In [11]:
# Get the number of unique companies in the dataset.
print pd.unique(df['companyId'])
print ("Number of unique company IDs: {0}".format(len(pd.unique(df['companyId']))))

['COMP37' 'COMP19' 'COMP52' 'COMP38' 'COMP7' 'COMP15' 'COMP24' 'COMP20'
 'COMP41' 'COMP56' 'COMP4' 'COMP54' 'COMP57' 'COMP14' 'COMP61' 'COMP58'
 'COMP3' 'COMP44' 'COMP30' 'COMP27' 'COMP34' 'COMP11' 'COMP31' 'COMP49'
 'COMP0' 'COMP1' 'COMP36' 'COMP47' 'COMP8' 'COMP42' 'COMP50' 'COMP53'
 'COMP48' 'COMP45' 'COMP46' 'COMP2' 'COMP5' 'COMP55' 'COMP29' 'COMP40'
 'COMP33' 'COMP22' 'COMP12' 'COMP25' 'COMP6' 'COMP23' 'COMP17' 'COMP28'
 'COMP21' 'COMP26' 'COMP43' 'COMP51' 'COMP10' 'COMP59' 'COMP13' 'COMP39'
 'COMP16' 'COMP9' 'COMP32' 'COMP62' 'COMP35' 'COMP18' 'COMP60']
Number of unique company IDs: 63


In [11]:
# histogram plot for companyId
fig = plt.figure(figsize=(12,12))
ax = sns.boxplot(x='salary',y='companyId',data=df,orient='h')
plt.savefig('companyId_distplot.png')

In [12]:
# Counting the frequency that a value occurs in the df column
# We can see that the values in columns 'jobType'and 'industry' are evenly distributed. 
# There are 47.383% job postings which require NO or a just a high school degree.
# There are 53.2355% job postings which don't require any major.
print df['jobType'].value_counts()
print df['degree'].value_counts()
print df['major'].value_counts()
print df['industry'].value_counts()

SENIOR            125886
VICE_PRESIDENT    125235
MANAGER           125121
CTO               125046
JANITOR           124971
CEO               124778
JUNIOR            124594
CFO               124369
Name: jobType, dtype: int64
HIGH_SCHOOL    236976
NONE           236854
BACHELORS      175495
DOCTORAL       175364
MASTERS        175311
Name: degree, dtype: int64
NONE           532355
CHEMISTRY       58875
LITERATURE      58684
ENGINEERING     58596
BUSINESS        58518
PHYSICS         58410
COMPSCI         58382
BIOLOGY         58379
MATH            57801
Name: major, dtype: int64
WEB          143206
AUTO         142943
FINANCE      142867
EDUCATION    142819
OIL          142771
HEALTH       142755
SERVICE      142639
Name: industry, dtype: int64


In [13]:
# Calculate the percentage job posting based on the value of columns
print df[(df['degree'] == 'NONE') | (df['degree'] == 'HIGH_SCHOOL')].shape[0] * 100 /float(df['degree'].shape[0])
print df[(df['major'] == 'NONE')].shape[0] * 100 / float(df['major'].shape[0])

47.383
53.2355


In [135]:
# regression plot for milesFromMetropolis
fig = plt.figure(figsize=(12,12))
ax = sns.regplot(x='salary',y='milesFromMetropolis',data=df,scatter_kws={"s": 5})
plt.savefig('mfm_salary_regplot.png')

In [88]:
# regression plot for yearsExperience and salary
fig = plt.figure(figsize=(12,12))
ax = sns.regplot(x='salary',y='yearsExperience',data=df)
plt.savefig('ye_salary_regplot.png')

In [25]:
# regression plot for yearsExperience and milesFromMetropolis
fig = plt.figure(figsize=(12,12))
ax = sns.regplot(x='milesFromMetropolis',y='yearsExperience',data=df)
plt.savefig('ye_mfm_regplot.png')

In [12]:
pd.unique(df['yearsExperience'])

array([10,  3,  8,  2, 23,  9,  1, 17, 24,  7, 21, 13, 11, 14, 18, 20, 12,
       15, 22,  0, 19,  5,  4, 16,  6])

In [13]:
df['yearsExperience'].describe()

count    1000000.000000
mean          11.992386
std            7.212391
min            0.000000
25%            6.000000
50%           12.000000
75%           18.000000
max           24.000000
Name: yearsExperience, dtype: float64

In [14]:
# Creating a Category variable out of yearsExperience variable
def f(row):
    if row['yearsExperience'] < 3:
        return 0
    elif row['yearsExperience'] > 3 and row['yearsExperience'] < 10:
        return 1
    elif row['yearsExperience'] >10 and row['yearsExperience'] < 15:
        return 2
    else:
        return 3
    
df['yearsExperienceCategory'] = df.apply(f,axis=1)

In [15]:
df['yearsExperienceCategory'] = df['yearsExperienceCategory'].astype(object)

In [134]:
# Doing regression plot with sub-categories to see how much difference does a degree make to your salary.
fig = plt.figure(figsize=(20,20))
# ax = sns.lmplot(x='salary',y='yearsExperience',data=df,hue='degree',\
#                 scatter_kws={'s':10.0},legend=False,fit_reg=False,\
#                 palette=dict(MASTERS="black", HIGH_SCHOOL="yellow", DOCTORAL="red",\
#                             BACHELORS="gray",NONE="green"))
ax = sns.lmplot(x='salary',y='yearsExperience',data=df,hue='degree',\
                scatter_kws={'s':5.0},legend=False,fit_reg=False,)
plt.legend(loc=2,prop={'size':6})
plt.savefig('ye_salary_degree_lmplot.png')

In [19]:
# Doing regression plot with sub-categories to see how much difference does industry make to your salary.
fig = plt.figure(figsize=(20,20))
ax = sns.lmplot(x='salary',y='yearsExperience',data=df,hue='industry',\
                scatter_kws={'s':5.0},legend=False,fit_reg=False,\
               palette=dict(HEALTH="black", WEB="yellow", AUTO="red",\
                            FINANCE="blue",EDUCATION="green",OIL="purple",SERVICE="orange"))
plt.legend(loc=2,prop={'size':6})
plt.savefig('ye_salary_industry_lmplot.png')

In [24]:
# SENIOR            125886
# VICE_PRESIDENT    125235
# MANAGER           125121
# CTO               125046
# JANITOR           124971
# CEO               124778
# JUNIOR            124594
# CFO               124369
# Doing regression plot with sub-categories to see how much difference does jobType make to your salary.
fig = plt.figure(figsize=(20,20))
ax = sns.lmplot(x='salary',y='yearsExperience',data=df,hue='jobType',\
                scatter_kws={'s':5.0},legend=False,fit_reg=False,\
               palette=dict(SENIOR="white", VICE_PRESIDENT="yellow", MANAGER="red",\
                            CTO="green",JANITOR="blue",CEO="purple",JUNIOR="orange",\
                           CFO='pink'))
plt.legend(loc=2,prop={'size':6})
plt.savefig('ye_salary_jobType_lmplot.png')

In [23]:
# NONE           532355
# CHEMISTRY       58875
# LITERATURE      58684
# ENGINEERING     58596
# BUSINESS        58518
# PHYSICS         58410
# COMPSCI         58382
# BIOLOGY         58379
# MATH            57801
# Doing regression plot with sub-categories to see how much difference does major make to your salary.
fig = plt.figure(figsize=(20,20))
ax = sns.lmplot(x='salary',y='yearsExperience',data=df,hue='major',\
                scatter_kws={'s':5.0},legend=False,fit_reg=False,\
               palette=dict(NONE="pink", CHEMISTRY="yellow", LITERATURE="red",\
                            ENGINEERING="blue",BUSINESS="green",PHYSICS="purple",COMPSCI="orange",\
                           BIOLOGY='gold',MATH='brown'))
plt.legend(loc=2,prop={'size':6})
plt.savefig('ye_salary_major_lmplot.png')

In [137]:
# Boxplot for salary based on the jobType
# The results are like expected. The higher is the position, more is the salary.
fig = plt.figure(figsize=(12,12))
ax = sns.boxplot(x='salary',y='jobType',data=df,orient='h')
plt.savefig('jobtype_salary_boxplot.png')

# Boxplot for salary based on the major required for the job.
# The salaries for the positions requiring engineering major is the highest.
fig = plt.figure(figsize=(12,12))
ax = sns.boxplot(x='salary',y='major',data=df,orient='h')
plt.savefig('major_salary_boxplot.png')

# Boxplot of salary based on degrees.
# We can see that degree makes a huge difference. In case of Degree = NONE or HIGH_SCHOOL, only the outliers
# cross the 200K range. 
# There is not much difference between BACHELORS and MASTERS however, salaries for DOCTORAL are slightly higher.
fig = plt.figure(figsize=(12,12))
ax = sns.boxplot(x='salary',y='degree',data=df,orient='h')
plt.savefig('degree_salary_boxplot.png')

# Boxplot of the salary based on the industry which the job requires.
# The range of salaries in the OIL and FINANCE industry is higher than the rest.
# In education/academia its the least.
fig = plt.figure(figsize=(12,12))
ax = sns.boxplot(x='salary',y='industry',data=df,orient='h')
plt.savefig('industry_salary_boxplot.png')

# Boxplot for salary and yearsExperienceCategory
fig = plt.figure(figsize=(12,12))
ax = sns.boxplot(x='salary',y='yearsExperienceCategory',data=df,orient='h')
ax.set_yticklabels(['0-3 yrs','4-9 yrs','10-15 yrs','Above 15 yrs'])
plt.savefig('yec_salary_boxplot.png')

In [16]:
# Calculating pearson correlation between yearsExperience, milesFromMetropolis and salary.
# There is not mild positive linear realtionship between yearsExperience and salary & years
# mild negative linear relationship between milesFromMetropolis and salary.
corr = df.corr(method='pearson')
corr

Unnamed: 0,yearsExperience,milesFromMetropolis,salary
yearsExperience,1.0,0.000673,0.375013
milesFromMetropolis,0.000673,1.0,-0.297666
salary,0.375013,-0.297666,1.0


In [140]:
# Boxplot for 'salary'.
# The median value is almost equal the mean value. There seem to be a lot of outliers at the top of the distribution.
# If we ignore the outliers, the range of salaries (between the opposite ends of the whiskers) is about 200K.
fig = plt.figure(figsize=(12,12))
ax = sns.boxplot(x='salary',data=df,orient='h',showmeans=True)
plt.savefig('salary_boxplot.png')

In [17]:
df.columns

Index([                  u'jobId',               u'companyId',
                       u'jobType',                  u'degree',
                         u'major',                u'industry',
               u'yearsExperience',     u'milesFromMetropolis',
                        u'salary', u'yearsExperienceCategory'],
      dtype='object')

In [17]:
# Change the data type of columns
df['jobType'] = df['jobType'].astype('category')
df['major'] = df['major'].astype('category')
df['degree'] = df['degree'].astype('category')
df['industry'] = df['industry'].astype('category')
df['yearsExperienceCategory'] = df['yearsExperienceCategory'].astype('category')
df['companyId'] = df['companyId'].astype('category')

In [18]:
df.dtypes

jobId                        object
companyId                    object
jobType                    category
degree                     category
major                      category
industry                   category
yearsExperience               int64
milesFromMetropolis           int64
salary                        int64
yearsExperienceCategory    category
dtype: object

In [30]:
# Convert the categorical varibles into numerical codes
df['degreeCoded'] = df['degree'].cat.codes
df['majorCoded'] = df['major'].cat.codes
df['industryCoded'] = df['industry'].cat.codes
df['jobTypeCoded'] = df['jobType'].cat.codes
df['companyIdCoded'] = df['companyId'].cat.codes

In [23]:
# Clustered bar plots for ('major' and'degree')
fig = plt.figure(figsize=(12,12))
ax = sns.countplot(x='degree',data=df,hue='major')
plt.savefig('degree_major_countplot.png')

# Clustered bar plots for ('industry' and'degree')
fig = plt.figure(figsize=(12,12))
ax = sns.countplot(x='degree',data=df,hue='industry')
plt.savefig('degree_industry_countplot.png')

# Clustered bar plots for ('jobType' and'degree')
fig = plt.figure(figsize=(12,12))
ax = sns.countplot(x='degree',data=df,hue='jobType')
plt.savefig('degree_jobType_countplot.png')

# Clustered bar plots for ('jobType' and'major')
fig = plt.figure(figsize=(12,12))
ax = sns.countplot(x='major',data=df,hue='jobType')
plt.savefig('major_jobType_countplot.png')

# Clustered bar plots for ('major' and'industry')
fig = plt.figure(figsize=(12,12))
ax = sns.countplot(x='major',data=df,hue='industry')
plt.savefig('major_industry_countplot.png')

# Clustered bar plots for ('jobType' and'industry')
fig = plt.figure(figsize=(12,12))
ax = sns.countplot(x='jobType',data=df,hue='industry')
plt.savefig('jobType_industry_countplot.png')


In [27]:
# Clustered bar plots for ('companyId' and'industry')
fig = plt.figure(figsize=(60,12))
ax = sns.countplot(x='companyId',data=df,hue='industry')
plt.savefig('companyId_industry_countplot.png')

# Clustered bar plots for ('companyId' and'jobType')
fig = plt.figure(figsize=(60,12))
ax = sns.countplot(x='companyId',data=df,hue='jobType')
plt.savefig('companyId_jobType_countplot.png')

# Clustered bar plots for ('companyId' and'major')
fig = plt.figure(figsize=(60,12))
ax = sns.countplot(x='companyId',data=df,hue='major')
plt.savefig('companyId_major_countplot.png')

# Clustered bar plots for ('companyId' and'degree')
fig = plt.figure(figsize=(60,12))
ax = sns.countplot(x='companyId',data=df,hue='degree')
plt.savefig('companyId_degree_countplot.png')




In [32]:
# Performing the chi squared test for independence for the categorical columns in df.
# Check their p values for determining if the columns are independent or not.
from scipy import stats as scs

def categories(series):
    return range(int(series.min()), int(series.max()) + 1)

def chi_square_of_df_cols(df, col1, col2):
    df_col1, df_col2 = df[col1], df[col2]

    result = [[sum((df_col1 == cat1) & (df_col2 == cat2))
               for cat2 in categories(df_col2)]
              for cat1 in categories(df_col1)]

    return scs.chi2_contingency(result)

In [43]:
# The values returned by the function are (chi square value, p-value, degree of freedom and the contigency table)
# The p value is 0.0 < 0.05 which means that degree and major are not independent.
print chi_square_of_df_cols(df, 'degreeCoded', 'majorCoded')

(791087.53637181141, 0.0, 32, array([[  10245.222605,   10269.61641 ,   10332.268125,   10245.74909 ,
          10283.30502 ,   10298.74858 ,   10143.786495,   93425.640725,
          10250.66295 ],
       [  10237.574956,   10261.950552,   10324.5555  ,   10238.101048,
          10275.628944,   10291.060976,   10136.214564,   93355.90222 ,
          10243.01124 ],
       [  13834.421904,   13867.361568,   13951.962   ,   13835.132832,
          13885.845696,   13906.699584,   13697.449776,  126155.35848 ,
          13841.76816 ],
       [  10234.480869,   10258.849098,   10321.435125,   10235.006802,
          10272.523356,   10287.950724,   10133.151111,   93327.687405,
          10239.91551 ],
       [  13827.299666,   13860.222372,   13944.77925 ,   13828.010228,
          13878.696984,   13899.540136,   13690.398054,  126090.41117 ,
          13834.64214 ]]))


In [44]:
# The values returned by the function are (chi square value, p-value, degree of freedom and the contigency table)
# The p value is 0.44 > 0.05 which means that degree and industry are independent.
print chi_square_of_df_cols(df, 'degreeCoded', 'industryCoded')

(24.270671904949687, 0.44620975414402414, 24, array([[ 25085.781785,  25064.020405,  25072.444165,  25052.788725,
         25055.596645,  25032.431305,  25131.93697 ],
       [ 25067.056252,  25045.311116,  25053.728588,  25034.08782 ,
         25036.893644,  25013.745596,  25113.176984],
       [ 33874.060368,  33844.675344,  33856.050192,  33829.50888 ,
         33833.300496,  33802.019664,  33936.385056],
       [ 25059.480273,  25037.741709,  25046.156637,  25026.521805,
         25029.326781,  25006.185729,  25105.587066],
       [ 33856.621322,  33827.251426,  33838.620418,  33812.09277 ,
         33815.882434,  33784.617706,  33918.913924]]))


In [45]:
# The values returned by the function are (chi square value, p-value, degree of freedom and the contigency table)
# The p value is 0.0 < 0.05 which means that degree and jobType are not independent.
print chi_square_of_df_cols(df, 'degreeCoded', 'jobTypeCoded')

(158620.81447095107, 0.0, 28, array([[ 21897.91511 ,  21826.137655,  21944.94777 ,  21931.785645,
         21865.62403 ,  21958.109895,  22092.36357 ,  21978.116325],
       [ 21881.569192,  21809.845316,  21928.566744,  21915.414444,
         21849.302216,  21941.719044,  22075.872504,  21961.71054 ],
       [ 29569.391328,  29472.468144,  29632.900896,  29615.127696,
         29525.787744,  29650.674096,  29831.960736,  29677.68936 ],
       [ 21874.955958,  21803.253759,  21921.939306,  21908.790981,
         21842.698734,  21935.087631,  22069.200546,  21955.073085],
       [ 29554.168412,  29457.295126,  29617.645284,  29599.881234,
         29510.587276,  29635.409334,  29816.602644,  29662.41069 ]]))


In [46]:
# The values returned by the function are (chi square value, p-value, degree of freedom and the contigency table)
# The p value is 0.0 < 0.05 which means that major and jobType are not independent.
print chi_square_of_df_cols(df, 'majorCoded', 'jobTypeCoded')

(125514.70618848808, 0.0, 56, array([[  7284.414862,   7260.537851,   7300.060434,   7295.682009,
          7273.673126,   7304.438859,   7349.098794,   7311.094065],
       [  7301.759004,   7277.825142,   7317.441828,   7313.052978,
          7290.991692,   7321.830678,   7366.596948,   7328.50173 ],
       [  7346.30475 ,   7322.224875,   7362.08325 ,   7357.667625,
          7335.47175 ,   7366.498875,   7411.53825 ,   7373.210625],
       [  7284.789196,   7260.910958,   7300.435572,   7296.056922,
          7274.046908,   7304.814222,   7349.476452,   7311.46977 ],
       [  7311.491688,   7287.525924,   7327.195416,   7322.800716,
          7300.710024,   7331.590116,   7376.416056,   7338.27006 ],
       [  7322.472152,   7298.470396,   7338.199464,   7333.798164,
          7311.674296,   7342.600764,   7387.494024,   7349.29074 ],
       [  7212.293178,   7188.652569,   7227.783846,   7223.448771,
          7201.657794,   7232.118921,   7276.336686,   7238.708235],
       [ 66

In [47]:
# The values returned by the function are (chi square value, p-value, degree of freedom and the contigency table)
# The p value is 0.655 > 0.05 which means that major and industry are independent.
print chi_square_of_df_cols(df, 'majorCoded', 'industryCoded')

(43.55662241845252, 0.65529488956945148, 48, array([[  8344.869397,   8337.630401,   8340.432593,   8333.894145,
          8334.828209,   8327.122181,   8360.223074],
       [  8364.738474,   8357.482242,   8360.291106,   8353.73709 ,
          8354.673378,   8346.949002,   8380.128708],
       [  8415.769125,   8408.468625,   8411.294625,   8404.700625,
          8405.642625,   8397.871125,   8431.25325 ],
       [  8345.298226,   8338.058858,   8340.861194,   8334.32241 ,
          8335.256522,   8327.550098,   8360.652692],
       [  8375.888028,   8368.622124,   8371.434732,   8364.87198 ,
          8365.809516,   8358.074844,   8391.298776],
       [  8388.467012,   8381.190196,   8384.007028,   8377.43442 ,
          8378.373364,   8370.627076,   8403.900904],
       [  8262.248343,   8255.081019,   8257.855467,   8251.381755,
          8252.306571,   8244.676839,   8277.450006],
       [ 76096.420765,  76030.408745,  76055.961785,  75996.338025,
         76004.855705,  75934.584

In [48]:
# The values returned by the function are (chi square value, p-value, degree of freedom and the contigency table)
# The p value is 0.266 > 0.05 which means that industry and jobType are independent.
print chi_square_of_df_cols(df, 'jobTypeCoded', 'industryCoded')

(47.265180908558605, 0.2662543916255608, 42, array([[ 17836.141654,  17820.669182,  17826.658526,  17812.68339 ,
         17814.679838,  17798.209142,  17868.958268],
       [ 17777.677967,  17762.256211,  17768.225923,  17754.296595,
         17756.286499,  17739.869791,  17810.387014],
       [ 17874.450378,  17858.944674,  17864.946882,  17850.94173 ,
         17852.942466,  17836.436394,  17907.337476],
       [ 17863.729653,  17848.233249,  17854.231857,  17840.235105,
         17842.234641,  17825.738469,  17896.597026],
       [ 17809.840142,  17794.390486,  17800.370998,  17786.41647 ,
         17788.409974,  17771.963566,  17842.608364],
       [ 17885.171103,  17869.656099,  17875.661907,  17861.648355,
         17863.650291,  17847.134319,  17918.077926],
       [ 17994.522498,  17978.912634,  17984.955162,  17970.85593 ,
         17972.870106,  17956.253154,  18027.630516],
       [ 17901.466605,  17885.937465,  17891.948745,  17877.922425,
         17879.926185,  17863.395

In [34]:
# The values returned by the function are (chi square value, p-value, degree of freedom and the contigency table)
# The p value is  0.128 > 0.05 which means that industry and companyId are independent.
print chi_square_of_df_cols(df, 'companyIdCoded', 'industryCoded')

(403.017003121779, 0.12897152977894466, 372, array([[ 2239.345038,  2237.402454,  2238.154422,  2236.39983 ,
         2236.650486,  2234.582574,  2243.465196],
       [ 2261.35826 ,  2259.39658 ,  2260.15594 ,  2258.3841  ,
         2258.63722 ,  2256.54898 ,  2265.51892 ],
       [ 2274.080187,  2272.107471,  2272.871103,  2271.089295,
         2271.343839,  2269.243851,  2278.264254],
       [ 2268.791296,  2266.823168,  2267.585024,  2265.80736 ,
         2266.061312,  2263.966208,  2272.965632],
       [ 2256.069369,  2254.112277,  2254.869861,  2253.102165,
         2253.354693,  2251.271337,  2260.220298],
       [ 2280.798508,  2278.819964,  2279.585852,  2277.79878 ,
         2278.054076,  2275.947884,  2284.994936],
       [ 2273.937244,  2271.964652,  2272.728236,  2270.94654 ,
         2271.201068,  2269.101212,  2278.121048],
       [ 2272.936643,  2270.964919,  2271.728167,  2269.947255,
         2270.201671,  2268.102739,  2277.118606],
       [ 2256.069369,  2254.112277,

In [35]:
# The values returned by the function are (chi square value, p-value, degree of freedom and the contigency table)
# The p value is 0.2997 > 0.05 which means that major and companyId are independent.
print chi_square_of_df_cols(df, 'companyIdCoded', 'majorCoded')

(512.04481815501072, 0.29974150437123448, 496, array([[  914.565414,   916.742988,   922.33575 ,   914.612412,
          917.964936,   919.343544,   905.510466,  8339.87343 ,   915.05106 ],
       [  923.55578 ,   925.75476 ,   931.4025  ,   923.60324 ,
          926.98872 ,   928.38088 ,   914.41182 ,  8421.8561  ,   924.0462  ],
       [  928.751511,   930.962862,   936.642375,   928.799238,
          932.203764,   933.603756,   919.556109,  8469.235695,   929.24469 ],
       [  926.591488,   928.797696,   934.464   ,   926.639104,
          930.035712,   931.432448,   917.417472,  8449.53856 ,   927.08352 ],
       [  921.395757,   923.589594,   929.224125,   921.443106,
          924.820668,   926.209572,   912.273183,  8402.158965,   921.88503 ],
       [  931.495324,   933.713208,   939.4095  ,   931.543192,
          934.957776,   936.361904,   922.272756,  8494.25638 ,   931.98996 ],
       [  928.693132,   930.904344,   936.5835  ,   928.740856,
          932.145168,   933.545

In [36]:
# The values returned by the function are (chi square value, p-value, degree of freedom and the contigency table)
# The p value is 0.6988 > 0.05 which means that degree and companyId are independent.
print chi_square_of_df_cols(df, 'companyIdCoded', 'degreeCoded')

(235.92972640838522, 0.69880239244370124, 248, array([[ 2749.30467 ,  2747.252424,  3712.466016,  2746.422126,
         3710.554764],
       [ 2776.3309  ,  2774.25848 ,  3748.96032 ,  2773.42002 ,  3747.03028 ],
       [ 2791.949955,  2789.865876,  3770.051184,  2789.022699,
         3768.110286],
       [ 2785.45664 ,  2783.377408,  3761.283072,  2782.536192,
         3759.346688],
       [ 2769.837585,  2767.770012,  3740.192208,  2766.933513,
         3738.266682],
       [ 2800.19822 ,  2798.107984,  3781.189056,  2797.262316,
         3779.242424],
       [ 2791.77446 ,  2789.690512,  3769.814208,  2788.847388,
         3767.873432],
       [ 2790.545995,  2788.462964,  3768.155376,  2787.620211,
         3766.215454],
       [ 2769.837585,  2767.770012,  3740.192208,  2766.933513,
         3738.266682],
       [ 2757.903925,  2755.84526 ,  3724.07784 ,  2755.012365,  3722.16061 ],
       [ 2800.373715,  2798.283348,  3781.426032,  2797.437627,
         3779.479278],
       [ 280

In [37]:
# The values returned by the function are (chi square value, p-value, degree of freedom and the contigency table)
# The p value is 0.5605 > 0.05 which means that degree and jobType are independent.
print chi_square_of_df_cols(df, 'companyIdCoded', 'jobTypeCoded')

(428.86734982297691, 0.56050592083977657, 434, array([[ 1954.772148,  1948.364754,  1958.970636,  1957.795686,
         1951.889604,  1960.145586,  1972.130076,  1961.93151 ],
       [ 1973.98796 ,  1967.51758 ,  1978.22772 ,  1977.04122 ,
         1971.07708 ,  1979.41422 ,  1991.51652 ,  1981.2177  ],
       [ 1985.093202,  1978.586421,  1989.356814,  1988.163639,
         1982.165946,  1990.549989,  2002.720374,  1992.363615],
       [ 1980.476416,  1973.984768,  1984.730112,  1983.539712,
         1977.555968,  1985.920512,  1998.062592,  1987.72992 ],
       [ 1969.371174,  1962.915927,  1973.601018,  1972.417293,
         1966.467102,  1974.784743,  1986.858738,  1976.584005],
       [ 1990.957768,  1984.431764,  1995.233976,  1994.037276,
         1988.021864,  1996.430676,  2008.637016,  1998.24966 ],
       [ 1984.968424,  1978.462052,  1989.231768,  1988.038668,
         1982.041352,  1990.424868,  2002.594488,  1992.23838 ],
       [ 1984.094978,  1977.591469,  1988.356446, 