# What is the difference between a Data Scientist and a Data Analyst?

As our online and in-person data scientist programs are maturing there is interest from both the Product and Curriculum teams on having data-driven insights on the data science job market to make sure that the Flatiron school's offerings and marketing best fit the evolving data science job market our graduates are heading into. 

The first step in this exploration is to try and get more clear definitions as to the difference between a data scientist and a data analyst. It is well known in the data science community that the understanding of the field among the general market, hiring managers, and HR recruiters is quite variable and there is significant overlap between roles that have widely varying titles. To try and get some clarity we have scraped a dataset from LinkedIn covering data scientist and data analyst roles and filtered it for roles in NYC, Atlanta, and Kansas City, MO aiming to incorporate listings from a major tech hub, a developing tech hub, and a non-technical job market so as to be representative of the variety of job markets our graduates will be entering. 

In [1]:
#library imports - obtain, scrub, explore
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

OBTAIN

In [2]:
df = pd.read_csv('Data/jobs.csv')
df = pd.DataFrame(df)
df.head()

Unnamed: 0,company,description,job_cat,loc,location,position,position_low
0,Loftium,About the role\nMachine learning is core to ou...,machine learning,"Seattle, WA","Seattle, Washington, United States",Machine Learning Engineer,
1,Zume Inc.,Who We Are\n\nZume is on a quest to be the mos...,machine learning,"Seattle, WA","Seattle, WA, US",Machine Learning Engineer - Platform,
2,"TRC Staffing Services, Inc.",The goal is to lead the processes from infrast...,machine learning,"Seattle, WA","Seattle, WA, US",Machine Learning Engineer,
3,bidco,We are looking for a Machine Learning Engineer...,machine learning,"Seattle, WA","Seattle, WA, US",Machine Learning Engineer,
4,Compass /,Engineering\n\nMachine Learning Engineer\n\nSe...,machine learning,"Seattle, WA","Seattle, WA, US",Machine Learning Engineer,


In [3]:
df = df.drop(['location','position_low'], axis=1)

In [4]:
locations = ['NY', 'KCMO', 'ATL']
roles = ['data scientist', 'data analyst']
# df['loc'].unique()
df_filteredbyloc = df[df['loc'].isin(locations)]
df_filteredbyloc = df_filteredbyloc[df_filteredbyloc['job_cat'].isin(roles)].reset_index()
df_filteredbyloc.head()

Unnamed: 0,index,company,description,job_cat,loc,position
0,2078,PRI Technology,Sr. Data Scientist\n\nThe Sr. Data Scientist i...,data scientist,ATL,Sr. Data Scientist
1,2079,StevenDouglas,POSITION SUMMARY:\nThe Data Scientist provides...,data scientist,ATL,Data Scientist
2,2080,Experience LLC,When your team hits a game winner or the band ...,data scientist,ATL,Data Scientist
3,2081,Arby's,Purpose Of The Position\n\nAn Inspire data sci...,data scientist,ATL,Data Scientist
4,2082,Collabera Inc.,"Atlanta, Georgia\nSkills : python ,r ,scala ,j...",data scientist,ATL,Data Scientist


DATA CLEANING - NLP PREP

In [5]:
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk import word_tokenize
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec, Doc2Vec, TfidfModel
from gensim.models.phrases import Phraser, Phrases

[nltk_data] Downloading package punkt to /Users/blewis2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
stopword_set = stopwords.words('english')
stopword_set = stopword_set + ["need", "goal", "include", "looking", "seeking"] + list(string.punctuation)

In [7]:
def cleaner(text, stopwords=stopwords.words('english'), symbols = "?&()*%$#@.!:;^"):
    text = text.replace("\n", " ").replace("*", "").strip().strip("\n").replace("-", "_")
    for symbol in symbols:
        text = text.replace(symbol, "")
    text = text.lower()
    new_text = []
    for word in text.split(" "):
        if word in stopwords or word == ' ' or word == '':
            continue
        else:
            new_text.append(word)
    return new_text

In [8]:
descriptions = df_filteredbyloc.description 
descriptions = [cleaner(d, stopwords=stopword_set) for d in descriptions]
df_filteredbyloc['descriptions'] = descriptions

In [9]:
bigram = Phrases(sentences = descriptions, min_count=1, threshold=1)
phrased_descriptions = [bigram[d] for d in descriptions]
df_filteredbyloc['phrased_descriptions'] = phrased_descriptions
df_filteredbyloc['cleaned_phrased_descriptions'] = [' '.join(d) for d in df_filteredbyloc.phrased_descriptions]

In [10]:
y = df_filteredbyloc['job_cat']
X = df_filteredbyloc['cleaned_phrased_descriptions']

In [11]:
#vectorize data
count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

# tokenize and build vocab
# count_vectorizer.fit(X.description)
# count_vector = count_vectorizer.transform(X.description)
# # summarize encoded vector
# print(count_vector.shape)
# print(type(count_vector))
# # print(vector.toarray())
# vectorizer.fit_transform(df_filteredbyloc)
# vectorizer.vocabulary_
X_t = tfidf_vectorizer.fit_transform(X)
X_t.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.04009723, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_t.toarray(), y, test_size=0.2,
                                                    random_state=0)

SCRUB

In [None]:
df_filteredbyloc.sum().isna()

EXPLORE

In [None]:
df_filteredbyloc.groupby('loc').count()

MODEL

In [13]:
#library imnports - model
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB

In [None]:
len(tfidf_vectorizer.get_feature_names())

In [14]:
#classification model pipelines
pipelines = {
    'multinomialnb': make_pipeline(TfidfVectorizer(), MultinomialNB()),
    'logisticregression': make_pipeline(TfidfVectorizer(), LogisticRegression()),
    'randomforestclassifier': make_pipeline(TfidfVectorizer(), RandomForestClassifier()),
    'gradientboostingclassifier': make_pipeline(TfidfVectorizer(), GradientBoostingClassifier())
}
#define hyperparameters for each model 
multinomialnb_hyperparameters = {
    'multinomialnb__alpha': np.linspace(0.5, 1.5, 6),
    'multinomialnb__fit_prior': [True, False]
}
logisticregression_hyperparameters = {
    'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}
randomforestclassifier_hyperparameters = {
    'randomforestclassifier__n_estimators': [100, 200], 
    'randomforestclassifier__max_features': ['auto', 'sqrt',0.33]
}
gradientboostingclassifier_hyperparameters = {
    'gradientboostingclassifier__n_estimators': [100, 200], 
    'gradientboostingclassifier__learning_rate': [0.05, 0.1, 0.2],
    'gradientboostingclassifier__max_depth': [1, 3, 5]
}
#set hyperparameter dictionary
hyperparameters = {
    'multinomialnb': multinomialnb_hyperparameters,
    'logisticregression': logisticregression_hyperparameters,
    'randomforestclassifier': randomforestclassifier_hyperparameters,
    'gradientboostingclassifier': gradientboostingclassifier_hyperparameters
}

In [None]:
fitted_models = {}

for name, pipeline in pipelines.items():
    model = GridSearchCV(pipeline, hyperparameters[name], cv=10, n_jobs=-1, scoring='roc_auc')
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    confusion_matrix(y_hat, y_test)
    fitted_models[name] = model
    print(name, 'has been fitted.')

for name, model in fitted_models.items(): 
    print(name, model.best_score_)
    y_hat = model.predict(X_test)
    confusion_matrix(y_hat, y_test)

In [None]:

model = RandomForestClassifier(n_estimators=100, max_features=0.33, random_state=0) 
model.fit(tfidf_train, y_train)
y_hat = model.predict(tfidf_test)
print(confusion_matrix(y_hat, y_test))
print(classification_report(y_hat, y_test))

In [None]:
for name, model in fitted_models.items(): 
    print(name, model.best_score_)
    y_hat = model.predict(X_test)
    print(confusion_matrix(y_hat, y_test))
    print(classification_report(y_hat, y_test))

In [None]:
fitted_models['randomforestclassifier']

In [None]:
fitted_models['randomforestclassifier'].best_params_

In [None]:
# Save classifier to a file
import pickle 

save_classifier = open("Tfidf_randomforestclassifier.pickle", 'wb') #wb= write in bytes. 
pickle.dump(fitted_models['randomforestclassifier'], save_classifier) #use pickle to dump the grid3 we trained, as 'Tfidf_LogR.pickle' in wb format
save_classifier.close() 

In [None]:
randomforestclassifier = fitted_models['randomforestclassifier']

In [None]:
# Retrieve the saved file and uplaod it to an object

model_randomforestclassifier = open("Tfidf_randomforestclassifier.pickle", 'rb') # rb= read in bytes
grid = pickle.load(model_randomforestclassifier)
model_randomforestclassifier.close()

In [15]:
# non-pipeline rf_classifier for convenience
clf = RandomForestClassifier(max_features=0.33, n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)
print(confusion_matrix(y_hat, y_test))
print(classification_report(y_hat, y_test))

[[ 97   7]
 [  9 122]]
                precision    recall  f1-score   support

  data analyst       0.92      0.93      0.92       104
data scientist       0.95      0.93      0.94       131

     micro avg       0.93      0.93      0.93       235
     macro avg       0.93      0.93      0.93       235
  weighted avg       0.93      0.93      0.93       235



## Extracting Feature Importances of Each Class

In [36]:
y.value_counts()

data scientist    633
data analyst      542
Name: job_cat, dtype: int64

In [17]:
top_10 = clf.feature_importances_.argsort()[-10:]

In [18]:
top_10_ds = [clf.feature_importances.argsort()[-10:] for y.value_counts()[0]]

for index in top_10_ds:
    print(tfidf_vectorizer.get_feature_names()[index])

SyntaxError: invalid syntax (<ipython-input-18-adfcd779d53d>, line 1)

In [19]:
for index in top_10:
    print(tfidf_vectorizer.get_feature_names()[index])

science
scientist
reporting
data_scientists
reports
learning
data_science
machine_learning
data_analyst
data_scientist


In [20]:
sorted(clf.feature_importances_)[-10:]

[0.012356849606757031,
 0.014717732693692143,
 0.015501973843842942,
 0.016314041955089564,
 0.017472926816099712,
 0.020245186524562135,
 0.08655514909765775,
 0.14632182342387842,
 0.17582859591540084,
 0.19929635783246208]

In [21]:
# feature importances for entire model
first_map = dict(zip(tfidf_vectorizer.get_feature_names(), X_t.toarray()[0]))

In [28]:
dummy_map = dict(zip(sorted(clf.feature_importances_), X_t.toarray()[0])
            
for k, v in dummy_map.items():
    if v > 0:
        print(k, v)

SyntaxError: invalid syntax (<ipython-input-28-0fe7af5a75b8>, line 3)

In [52]:
for k, v in sorted(first_map.items(), key=lambda v: v[1], reverse=False):
    if v > 0.00:
        print(k, v)

team 0.030413982321483155
work 0.030594109073129188
machine_learning 0.03160622239778968
data_analysis 0.03690453107295949
python 0.03729677233643706
computer_science 0.03735365745910442
bachelor 0.03976034134350844
engineering 0.04223051784065325
data_mining 0.043379725245559615
problems 0.04372499673522645
client 0.045095897048819875
ability_work 0.04548377149646243
_python 0.047144629010824717
master 0.048285055276612164
software 0.050449257475217814
work_experience 0.05173609098469197
health 0.05266003472407853
_sql 0.05469942522400743
_machine 0.05771615282167054
concepts 0.05771615282167054
ai 0.05816868219931637
data 0.05834945520066635
healthcare 0.05961487543615992
languages 0.05961487543615992
natural_language 0.05961487543615992
statistical_modeling 0.06066318835828573
use_data 0.060937183690467446
statistical_models 0.06238806452007947
_physics 0.06269595845170343
experience_preferred 0.06301033483476037
high_quality 0.06333147251593664
data_scientist 0.06415810649265051
un

In [45]:
first_map.columns = ['word', 'importance']

AttributeError: 'dict' object has no attribute 'columns'

In [54]:
df_first_map = pd.DataFrame.from_dict(first_map, orient='index')
df_first_map.columns = ['importance']
df_first_map = df_first_map[df_first_map.importance > 0.00]
df_first_map.sort_values(by='importance', ascending=False).head(46)

Unnamed: 0,importance
develop_functionality,0.110152
mathematics_data,0.110152
improvement_responsibilities,0.110152
scale_experimentation,0.110152
scale_real_world,0.110152
scientist_sr,0.110152
applications_apply,0.110152
steps_modeling,0.104097
perform_large,0.104097
driven_interest,0.104097


In [None]:
# vec_ds = 
# vec_da = 
ds_map = dict(zip([tfidf_vectorizer.get_feature_names() for i in df_filteredbyloc[df_filteredbyloc['job_cat']=='data scientist'].index], [X_t.toarray()[0] for y in df_filteredbyloc[df_filteredbyloc['job_cat']=='data scientist'].index]))
da_map = dict(zip([tfidf_vectorizer.get_feature_names() for i in df_filteredbyloc[df_filteredbyloc['job_cat']=='data analyst'].index], [X_t.toarray()[0] for y in df_filteredbyloc[df_filteredbyloc['job_cat']=='data analyst'].index]))
# dict(zip(tfidf_vectorizer.get_feature_names(), X_t.toarray()[0]))

In [None]:
for k, v in ds_map.items():
    if v > 0:
        print(k, v)

In [None]:
for k, v in da_map.items():
    if v > 0:
        print(k, v)

In [None]:
y.value_counts()[1]

In [None]:
# feature importances for ['data scientist']
# dspos_map = dict(zip()
print(tfidf_vectorizer.dtype)
print(X_t.dtype)

In [None]:
# feature importances for ['data engineer']

### Technique 1, (Source: https://buhrmann.github.io/tfidf-analysis.html)

In [None]:
clf_rf.named_steps

In [None]:
X_train.values[0]

In [None]:
X.shape
# type(X_train)
len(X.shape) == 0

In [None]:
# Xtr = pipelines['randomforestclassifier'].fit_transform(list(X.values))
Xtr = X_t.toarray()
vec = clf.named_steps['tfidfvectorizer']
features = vec.get_feature_names()

In [None]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

In [None]:
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

In [None]:
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        indentified by indices in grp_ids. '''
    if grp_ids:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [None]:
def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label. '''
    dfs = []
    labels = np.unique(y)
    for label in labels:
        ids = np.where(y==label)
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs.append(feats_df)
    return dfs

In [None]:
def plot_tfidf_classfeats_h(dfs):
    ''' Plot the data frames returned by the function plot_tfidf_classfeats(). '''
    fig = plt.figure(figsize=(12, 9), facecolor="w")
    x = np.arange(len(dfs[0]))
    for i, df in enumerate(dfs):
        ax = fig.add_subplot(1, len(dfs), i+1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel("Mean Tf-Idf Score", labelpad=16, fontsize=14)
        ax.set_title("label = " + str(df.label), fontsize=16)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        ax.barh(x, df.tfidf, align='center', color='#3F5D7D')
        ax.set_yticks(x)
        ax.set_ylim([-1, x[-1]+1])
        yticks = ax.set_yticklabels(df.feature)
        plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.show()

In [None]:
top_tfidf_feats(X[0], features, top_n=25)
# top_feats_in_doc(Xtr, features, row_id, top_n=25)
# top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25)
# top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25)
# plot_tfidf_classfeats_h(dfs)

In [None]:
top_10 = clf_rf.named_steps.randomforestclassifier.feature_importances_.argsort()[-10:]

In [None]:
for index in top_10:
    print(tfidf_train.get_feature_names()[index])

### Technique 2 

In [None]:
clf_rf.named_steps.randomforestclassifier.feature_importances_

In [None]:
feat_importances = pd.DataFrame(clf_rf.named_steps.randomforestclassifier.feature_importances_, index=X_train.columns, columns=['Score']) #creating a list of top 10 features from RF model
feat_importances = feat_importances.sort_values(by='Score',ascending=True) #sorting values
feat_importances.plot(kind='barh') #plotting the features in a horizontal bar chart
plt.show()

### Technique 3

In [None]:
vectorizer = fitted_models.best_estimator_.named_steps["tfidfvectorizer"]
# transform the training dataset:
X_test_set = vectorizer.transform(X_test)


# find maximum value for each of the features over dataset:
max_value = X_test_set.max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()

# get feature names
feature_names = np.array(vectorizer.get_feature_names())

print("Features with lowest tfidf:\n{}".format(
      feature_names[sorted_by_tfidf[:20]]))

print("\nFeatures with highest tfidf: \n{}".format(
      feature_names[sorted_by_tfidf[-20:]]))

### Technique 4

In [None]:
from sklearn.feature_selection import SelectFromModel
# model_randomforestclassifier.Pipeline.named_steps
importances = model.feature_importances_
# type(randomforestclassifier)
# model2 = SelectFromModel(model, prefit=True)
# X_new = model2.transform(tfidf_train)

indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(tfidf_train.shape[0]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(tfidf_train.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(tfidf_train.shape[1]), indices)
plt.xlim([-1, tfidf_train.shape[1]])
plt.show()

In [None]:
def show_most_informative_features(model, text=None, n=20):
    # Extract the vectorizer and the classifier from the pipeline
    vectorizer = model.named_steps['tfidfvectorizer']
    classifier = model.named_steps['randomforestclassifier']

    # Check to make sure that we can perform this computation
    if not hasattr(classifier, 'coef_'):
        raise TypeError(
            "Cannot compute most informative features on {}.".format(
                classifier.__class__.__name__
            )
        )

    if text is not None:
        # Compute the coefficients for the text
        tvec = model.transform([text]).toarray()
    else:
        # Otherwise simply use the coefficients
        tvec = classifier.coef_

    # Zip the feature names with the coefs and sort
    coefs = sorted(
        zip(tvec[0], vectorizer.get_feature_names()),
        key=itemgetter(0), reverse=True
    )

    # Get the top n and bottom n coef, name pairs
    topn  = zip(coefs[:n], coefs[:-(n+1):-1])

    # Create the output string to return
    output = []

    # If text, add the predicted value to the output.
    if text is not None:
        output.append("\"{}\"".format(text))
        output.append(
            "Classified as: {}".format(model.predict([text]))
        )
        output.append("")

    # Create two columns with most negative and most positive features.
    for (cp, fnp), (cn, fnn) in topn:
        output.append(
            "{:0.4f}{: >15}    {:0.4f}{: >15}".format(
                cp, fnp, cn, fnn
            )
        )

    return "\n".join(output)

In [None]:
show_most_informative_features(model)

INTERPRET

In [None]:
import re
sentence = 'he said that she said "hello".'
pattern = 'he'
p = re.compile(pattern)
p.findall(sentence)