### Scrapping data from web

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from bs4 import BeautifulSoup
import urllib2, requests

#### Functions to extract location , company, job_title, summary and salary

In [2]:
def extract_text(item):
    if item:
        return item.text.strip()
    else:
        return ''
    
def get_title_from_result(result):
    return result.find('a',{'data-tn-element': 'jobTitle'}).text.strip()

    
def get_company_from_result(result):
    return extract_text(result.find('span',{'class': 'company'}))

def get_location_from_result(result):
    return extract_text(result.find('span',{'class': 'location'}))

def get_summary_from_result(result):
    return extract_text(result.find('span',{'class': 'summary'}))

    

In [3]:
# url="http://www.indeed.com/jobs?q=data+scientist+${}%2C000&l=USA&start={}"
# max_result_per_city=6000

# rows=[]
# for salary in set(['55-70','70-85','85-100','100-115']):
#     for start in range(10):
#         r=requests.get(url.format(salary, start))
#         soup=BeautifulSoup(r.content)
#         results=soup.findAll('div',{'class':  'result'})
#         for result in results:
#             if result:
#                 row={}
#                 row['title']=get_title_from_result(result)
#                 row['company']=get_company_from_result(result)
#                 row['city']=get_location_from_result(result)
#                 row['summary']=get_summary_from_result(result)
#                 row['bin']=salary
                
#                 rows.append(row) 

In [4]:
# result.to_csv('indeed_companies.csv',encoding='utf-8')
# data_df=pd.DataFrame(rows)
# data_df.head()

In [5]:
# data_df.to_csv('indeed_companies.csv',encoding='utf-8')                                                                                        

In [6]:
pd.set_option('max_colwidth',200)
new_df=pd.read_csv('indeed_companies.csv')
new_df[new_df.bin=="55-70"]

Unnamed: 0.1,Unnamed: 0,bin,city,company,summary,title
300,300,55-70,Hawaii,State of Hawaii,"And preparing summaries or reports with data presented in graphic, narrative or numerical form. Modifying and applying research and statistical methods and...",RESEARCH STATISTICIAN IV
301,301,55-70,Illinois,Intel,"This includes the LAN, WAN, telephony, data centers, client PCs, backup and restore, and enterprise applications. Job Code valid only for Israel....",Data Scientist
302,302,55-70,"Frederick, MD 21702",Leidos,The Biological Life Sciences Operation of the Health Group is seeking a Biomedical Data Scientist who will perform as a productive member of a team that...,Biomedical Data Scientist
303,303,55-70,"South San Francisco, CA 94080",Incedo,Statistics and Data Science knowledge. Must needed exo in below skills and technology:....,Data Scientist
304,304,55-70,"San Francisco, CA",Gameloft,"Job Description Responsibilities: As User Acquisition/Data Analysis Intern, reporting to the Sr User Acquisition Manager, you will focus on marketing actions",User Acquisition/Data Analysis
305,305,55-70,"Bellevue, WA",Microsoft,Experiences in build and maintaining data pipelines. We are looking for applied scientists who have following required experience:....,Data Scientist II
306,306,55-70,"Springfield, IL 62715",Horace Mann Service Corp,Under minimal supervision and utilizing established or innovative procedures the incumbent develops and present to management for their approval proposals,Data Scientist
307,307,55-70,"Springfield, VA","Ferring Pharmaceuticals, Inc.",Plans data collection; Data collection and methodologies. Adjusts and weighs raw data; And interprets a wide variety of data....,Data Scientist
308,308,55-70,"Irvine, CA",Appriss,"Access and manipulate data from various data sources such as Netezza, Hadoop, Greenplum, SQL Server, raw files, and SAS....",Data Scientist/Statistician
309,309,55-70,"Wilmington, MA",CHARLES RIVER ENGLISH,"The Data Scientist will participate in strategic business initiatives by researching, analyzing and interpreting data;...",Intern - IT Data Scientist


#### Checking the contain of other bins

In [7]:
new_df.bin.value_counts()

85-100     100
100-115    100
55-70      100
70-85      100
Name: bin, dtype: int64

### Cleaning data
#### Importing libraries

In [8]:
import nltk
import nltk.stem
import nltk.stem.porter
import nltk.stem.snowball
snowball=nltk.stem.snowball.SnowballStemmer("english")
from nltk.corpus import stopwords
import re
from nltk.probability import FreqDist

### Subsetting data from Dataframe over categories(bins) converting into strings and assigning into new variables for each bin

In [9]:
junior=str(new_df[new_df['bin'].isin(['55-70'])].summary)
mid_low=str(new_df[new_df['bin'].isin(['70-85'])].summary)
mid_high=str(new_df[new_df['bin'].isin(['85-100'])].summary)
senior=str(new_df[new_df['bin'].isin(['100-115'])].summary)

In [10]:
### removing all the uppercase character
junior=junior.lower()
mid_low=mid_low.lower()
mid_high=mid_high.lower()
senior=senior.lower()

#### Cleaning it from numbers ,charaters and stopwords.Stopwords are meaningless words such as the, is he, she etc.

In [11]:
junior_clean=re.sub("[^a-zA-Z]", " ", junior)
mid_low_clean=re.sub("[^a-zA-Z]", " ", mid_low)
mid_high_clean=re.sub("[^a-zA-Z]", " ", mid_high)
senior_clean=re.sub("[^a-zA-Z]", " ", senior)

stop=set(stopwords.words("english"))

# updating the stopwords because in our case  data and scientist are not meaningful as well
stop.update([',', '"', "'", '?', '!', ':',';','{','}','(',')','[',']','...','Data','data','Scientist','scientist','ii','abc'])

In [12]:
#### Tokemizing using nltk library and removing stpwords
junior_words=[w for w in (nltk.word_tokenize(junior_clean)) if w.lower() not in stop]
mid_low_words=[w for w in (nltk.word_tokenize(mid_low_clean)) if w.lower() not in stop]
mid_high_words=[w for w in (nltk.word_tokenize(mid_high_clean)) if w.lower() not in stop]
senior_words=[w for w in (nltk.word_tokenize(senior_clean)) if w.lower() not in stop]

In [13]:
print len(senior_clean)
print len(nltk.word_tokenize(senior_clean))
print len(senior_words)

10479
1259
693


In [14]:
### Lets have a look at these words
junior_words[:100]

['preparing',
 'summaries',
 'reports',
 'presented',
 'graphic',
 'narrative',
 'numerical',
 'form',
 'modifying',
 'applying',
 'research',
 'statistical',
 'methods',
 'includes',
 'lan',
 'wan',
 'telephony',
 'centers',
 'client',
 'pcs',
 'backup',
 'restore',
 'enterprise',
 'applications',
 'job',
 'code',
 'valid',
 'israel',
 'biological',
 'life',
 'sciences',
 'operation',
 'health',
 'group',
 'seeking',
 'biomedical',
 'perform',
 'productive',
 'member',
 'team',
 'statistics',
 'science',
 'knowledge',
 'must',
 'needed',
 'exo',
 'skills',
 'technology',
 'job',
 'description',
 'responsibilities',
 'user',
 'acquisition',
 'analysis',
 'intern',
 'reporting',
 'sr',
 'user',
 'acquisition',
 'manager',
 'focus',
 'marketing',
 'actions',
 'experiences',
 'build',
 'maintaining',
 'pipelines',
 'looking',
 'applied',
 'scientists',
 'following',
 'required',
 'experience',
 'minimal',
 'supervision',
 'utilizing',
 'established',
 'innovative',
 'procedures',
 'incumb

In [15]:
junior_list=sorted(junior_words, key=junior_words.count, reverse=True)
fdist_junior=FreqDist(junior_list)
junior_df=pd.DataFrame(list(fdist_junior.iteritems()),columns=['Words','Value'])
junior_df=junior_df.sort_values(by="Value",axis=0, ascending=False)
junior_df.head()                  

Unnamed: 0,Words,Value
93,technical,20
168,development,15
100,software,15
37,advanced,15
71,health,11


In [16]:
mid_low_list=sorted(mid_low_words, key=mid_low_words.count, reverse=True)
fdist_mid_low=FreqDist(mid_low_list)
mid_low_df=pd.DataFrame(list(fdist_mid_low.iteritems()),columns=['Words','Value'])
mid_low_df=mid_low_df.sort_values(by="Value",axis=0, ascending=False)
mid_low_df.head()

Unnamed: 0,Words,Value
91,experience,17
79,business,12
180,analysis,12
6,sources,11
73,various,11


In [17]:
mid_high_list=sorted(mid_high_words, key=mid_high_words.count, reverse=True)
fdist_mid_high=FreqDist(mid_high_list)
mid_high_df=pd.DataFrame(list(fdist_mid_high.iteritems()),columns=['Words','Value'])
mid_high_df=mid_high_df.sort_values(by="Value",axis=0, ascending=False)
mid_high_df.head()

Unnamed: 0,Words,Value
91,experience,22
109,mining,18
78,business,12
12,statistical,12
52,methods,11


In [18]:
senior_list=sorted(senior_words, key=senior_words.count, reverse=True)
fdist_senior=FreqDist(senior_list)
senior_df=pd.DataFrame(list(fdist_senior.iteritems()),columns=['Words','Value'])
senior_df=senior_df.sort_values(by="Value",axis=0, ascending=False)
senior_df.head()  

Unnamed: 0,Words,Value
114,mining,22
130,science,17
94,experience,17
79,business,16
12,statistical,12


### now lets have a look at the bigrams and trigrams as they are more meaningful.

**Highlighting most common bigrams and trigrams fro each of category**

In [19]:
jun_pairs=["".join(pair) for pair in nltk.bigrams(junior_words)]
jun_trigrams=sorted(["".join(pair) for pair in nltk.trigrams(junior_words)])
fdist_jun_pairs=FreqDist(jun_pairs)
fdist_jun_trigrams=FreqDist(jun_trigrams)

jun_bigrams=new_df.from_dict(fdist_jun_pairs,orient='index', dtype=int)
jun_bigrams.columns=['Value']
jun_bigrams.sort_values(by='Value',axis=0, ascending=False)

Unnamed: 0,Value
advancedtechnical,10
descriptionresponsibilities,6
analyzinginterpreting,6
jobdescription,6
databasedesign,5
proposalspublications,5
unstructuredinformation,5
interpretfindings,5
valueanalyzation,5
incorporatingnew,5


In [20]:
mid_low_pairs=["".join(pair) for pair in nltk.bigrams(mid_low_words)]
mid_low_trigrams=sorted(["".join(pair) for pair in nltk.trigrams(mid_low_words)])
fdist_mid_low_pairs=FreqDist(mid_low_pairs)
fdist_mid_low_trigrams=FreqDist(mid_low_trigrams)

mid_low_bigrams=new_df.from_dict(fdist_mid_low_pairs,orient='index', dtype=int)
mid_low_bigrams.columns=['Value']
mid_low_bigrams.sort_values(by='Value',axis=0, ascending=False)

Unnamed: 0,Value
varioussources,10
experiencemodeling,6
sasworking,5
transparencyunderstanding,5
mindsetproduce,5
useadvanced,5
techniquesidentify,5
jsggplot,5
collectionmethodologies,5
netezzahadoop,5


In [21]:
mid_high_pairs=["".join(pair) for pair in nltk.bigrams(mid_high_words)]
mid_high_trigrams=sorted(["".join(pair) for pair in nltk.trigrams(mid_high_words)])
fdist_mid_high_pairs=FreqDist(mid_high_pairs)
fdist_mid_high_trigrams=FreqDist(mid_high_trigrams)

mid_high_bigrams=new_df.from_dict(fdist_mid_high_pairs,orient='index', dtype=int)
mid_high_bigrams.columns=['Value']
mid_high_bigrams.sort_values(by='Value',axis=0, ascending=False)

Unnamed: 0,Value
useracquisition,10
businessinsights,5
setscombining,5
methodsperform,5
establishedinnovative,5
experiencetraditional,5
analysisintern,5
performadhoc,5
maintainingpipelines,5
actionsefine,5


In [22]:
sen_pairs=["".join(pair) for pair in nltk.bigrams(senior_words)]
sen_trigrams=sorted(["".join(pair) for pair in nltk.trigrams(senior_words)])
fdist_sen_pairs=FreqDist(sen_pairs)
fdist_sen_trigrams=FreqDist(sen_trigrams)

sen_bigrams=new_df.from_dict(fdist_sen_pairs,orient='index', dtype=int)
sen_bigrams.columns=['Value']
sen_bigrams.sort_values(by='Value',axis=0, ascending=False)

Unnamed: 0,Value
useracquisition,10
businessintelligence,6
communicatingscientists,5
miningjob,5
extractmanipulate,5
visualisationtools,5
stateart,5
engineersefine,5
roleentail,5
architectingexecuting,5


In [23]:
import numpy as np
frames=[junior_df, mid_low_df,mid_high_df, senior_df]
whole_df=pd.concat(frames, axis=0, ignore_index=False)

new_list_jun=[]
for word in list(whole_df.Words):
    if word in list(junior_df.Words):
        new_list_jun.append(1)
    else:
        new_list_jun.append(0)
whole_df['junior']=new_list_jun 


new_list_mid_low=[]
for word in list(whole_df.Words):
    if word in list(mid_low_df.Words):
        new_list_mid_low.append(1)
    else:
        new_list_mid_low.append(0)
whole_df['mid_low']=new_list_mid_low 

new_list_mid_high=[]
for word in list(whole_df.Words):
    if word in list(mid_high_df.Words):
        new_list_mid_high.append(1)
    else:
        new_list_mid_high.append(0)
whole_df['mid_high']=new_list_mid_high 

new_list_senior=[]
for word in list(whole_df.Words):
    if word in list(senior_df.Words):
        new_list_senior.append(1)
    else:
        new_list_senior.append(0)
whole_df['senior']=new_list_senior
whole_df.head(10)

Unnamed: 0,Words,Value,junior,mid_low,mid_high,senior
93,technical,20,1,0,0,0
168,development,15,1,0,0,0
100,software,15,1,0,0,0
37,advanced,15,1,1,0,0
71,health,11,1,1,1,1
7,sources,11,1,1,1,1
96,experience,11,1,1,1,1
107,analyzing,11,1,1,0,0
58,design,10,1,0,0,0
55,findings,10,1,0,0,0


### MOdelling

In [24]:
import sklearn.preprocessing
import sklearn.feature_extraction
tfidf=sklearn.feature_extraction.text.TfidfVectorizer()

In [25]:
X=tfidf.fit_transform(whole_df.Words)
y=whole_df.junior

In [26]:
X

<770x330 sparse matrix of type '<type 'numpy.float64'>'
	with 770 stored elements in Compressed Sparse Row format>

In [27]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.33)



In [28]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((515, 330), (515,), (255, 330), (255,))

In [29]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [32]:
y_pred=knn.predict(X_test)

In [33]:
y_pred

array([1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 1], dtype=int64)

In [34]:
print 'Accuracy score :" %s' % accuracy_score(y_test, y_pred)

Accuracy score :" 0.843137254902


In [37]:
from sklearn.cross_validation import cross_val_score
cross_val_score(knn, X, y, cv=10, scoring="accuracy")

array([ 0.79220779,  0.90909091,  0.90909091,  0.85714286,  0.92207792,
        0.7012987 ,  0.72727273,  0.90909091,  0.92207792,  0.83116883])

In [38]:
confusion_matrix(y_test,y_pred)

array([[ 91,   0],
       [ 40, 124]])