# Modeling (Title only)

In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Import natural language toolkit
import nltk

# Import tokenizer
from nltk.tokenize import RegexpTokenizer

# Import lemmatizer
from nltk.stem import WordNetLemmatizer

# Import regular expression
import re

# Import wordcloud 
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Import Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Import cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# Import sparse to make matrix sparse (where most of the elements are zero)
from scipy import sparse

In [2]:
#setting the display options

pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [3]:
#reading the datafile for Text Preprocessing

df = pd.read_csv('../data/df.csv', converters={'author': eval, 'program': eval})
df.head()

Unnamed: 0.1,Unnamed: 0,paper,year,month,title,author,code,program
0,73,74,1975,March,Variation Across Household in the Rate of Inflation,[Robert T Michael],w00074,[Economic Fluctuations and Growth]
1,86,87,1975,May,Exports and Foreign Investment in the Pharmaceutical Industry,"[Merle Yahr Weiss, Robert E Lipsey]",w00087,"[International Trade and Investment, International Finance and Macroeconomics]"
2,106,107,1975,October,Social Security and Retirement Decisions,[Michael J Boskin],w00107,[Public Economics]
3,115,116,1975,November,Notes on the Tax Treatment of Human Capital,[Michael J Boskin],w00116,[Public Economics]
4,116,117,1980,April,Job Mobility and Earnings Growth,[Ann P Bartel],w00117,[Labor Studies]


In [4]:
#dropping the column = Unnamed:0
df.drop(columns='Unnamed: 0', axis=1,inplace=True)

In [5]:
df.head()

Unnamed: 0,paper,year,month,title,author,code,program
0,74,1975,March,Variation Across Household in the Rate of Inflation,[Robert T Michael],w00074,[Economic Fluctuations and Growth]
1,87,1975,May,Exports and Foreign Investment in the Pharmaceutical Industry,"[Merle Yahr Weiss, Robert E Lipsey]",w00087,"[International Trade and Investment, International Finance and Macroeconomics]"
2,107,1975,October,Social Security and Retirement Decisions,[Michael J Boskin],w00107,[Public Economics]
3,116,1975,November,Notes on the Tax Treatment of Human Capital,[Michael J Boskin],w00116,[Public Economics]
4,117,1980,April,Job Mobility and Earnings Growth,[Ann P Bartel],w00117,[Labor Studies]


In [6]:
#instantiate tokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [7]:
#tokenizing the title of working papers
df['title'] = df['title'].apply(lambda x: tokenizer.tokenize(x))

In [8]:
#Create stopword list
#add new words to the stopwords
stopwords = set(STOPWORDS)
new_words = ["may","aren", "couldn", "didn", "doesn", "don", "hadn", "hasn", "haven", "isn", "let", 
                  "ll", "mustn", "re", "shan", "shouldn", "ve", "wasn", "weren", "won", "wouldn", "t",
            "within","upon", "greater","effect","new", "the"]
stopwords = stopwords.union(new_words)

In [9]:
#instantiate lemmatizer
lemmatizer = WordNetLemmatizer()

In [10]:
#function to lemmatize the title text
def word_lemmatizer(title):
    lem_text = " ".join([lemmatizer.lemmatize(i) for i in title if not i in stopwords])
    return lem_text

In [11]:
#applying the lemmatizer and checking the title column
df['title'] = df['title'].apply(lambda x: word_lemmatizer(x))
df['title'].head()

0    Variation Across Household Rate Inflation         
1    Exports Foreign Investment Pharmaceutical Industry
2    Social Security Retirement Decisions              
3    Notes Tax Treatment Human Capital                 
4    Job Mobility Earnings Growth                      
Name: title, dtype: object

In [12]:
#joining all titles
title_text = " ".join(text for text in df['title'])

In [13]:
#instantiate the Count Vectorizer
cvec = CountVectorizer(max_df=0.8,stop_words=stopwords, max_features=15000, ngram_range=(2,3))

#construct the required CVEC by fitting anf tranforming the data
cvec_matrix = cvec.fit_transform(df['title'])

#output the shape of CVEC_matrix
cvec_matrix.shape

(20696, 15000)

In [14]:
#checking the vocabulary in the matrix
# print(cvec.get_feature_names())

In [15]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(max_df=0.8,stop_words=stopwords, max_features=15000,ngram_range=(2, 3),)

# #Replace NaN with an empty string
# metadata['overview'] = metadata['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df['title'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(20696, 15000)

In [16]:
# Compute the cosine similarity matrix
cosine_sim_1 = linear_kernel(tfidf_matrix, tfidf_matrix)

In [17]:
cosine_sim_1

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [18]:
# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(cvec_matrix, cvec_matrix)

In [19]:
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

### Recommender System using Count Vectorizer

In [20]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

# Function that takes in paper title as input and outputs most similar papers
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the wp that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all papers with that paper
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the wp based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar wp
    sim_scores = sim_scores[1:11]

    # Get the wp indices
    wp_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar working papers
    return pd.DataFrame({'year_published': df['year'].iloc[wp_indices],
                         'author': df['author'].iloc[wp_indices],
                         'title': df['title'].iloc[wp_indices],
                        'program': df['program'].iloc[wp_indices]})

In [21]:
get_recommendations('Exports Foreign Investment Pharmaceutical Industry', cosine_sim=cosine_sim)

Unnamed: 0,year_published,author,title,program
6,1976,"[Merle Yahr Weiss, Robert E Lipsey]",Exports Foreign Investment Manufacturing Industries,"[International Trade and Investment, International Finance and Macroeconomics]"
2821,1989,[Joosung Jun],What Marginal Source Funds Foreign Investment,[Public Economics]
4609,1994,"[Elhanan Helpman, Gene M Grossman]",Foreign Investment Endogenous Protection,[International Trade and Investment]
4700,1994,"[Ann E Harrison, Brian Aitken, Gordon H Hanson]",Spillovers Foreign Investment Export Behavior,[International Trade and Investment]
2472,1988,"[Guy V G Stevens, Robert E Lipsey]",Interactions Domestic Foreign Investment,"[International Trade and Investment, International Finance and Macroeconomics]"
3369,1991,[Magnus Blomstrom],Host Country Benefits Foreign Investment,"[International Trade and Investment, International Finance and Macroeconomics]"
4854,1995,"[Gordon H Hanson, Robert C Feenstra]",Foreign Investment Outsourcing Relative Wages,[International Trade and Investment]
5894,1997,[Fiona M Scott Morton],Entry Decisions Generic Pharmaceutical Industry,[Industrial Organization]
11960,2006,"[C Fritz Foley, James R Hines, Jr, Mihir A Desai]",Capital Structure Risky Foreign Investment,"[Corporate Finance, International Trade and Investment]"
19974,2014,"[Keith M Drake, Martha A Starr, Thomas G McGuire]",Do Reverse Payment Settlements Brand Generic Patent Disputes Pharmaceutical Industry Constitute Anticompetitive Pay Delay,"[Health Care, Law and Economics]"


### Recommender System using Tf-idf Vectorizer

In [22]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

# Function that takes in paper title as input and outputs most similar papers
def get_recommendations_1(title, cosine_sim_1=cosine_sim_1):
    # Get the index of the wp that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all papers with that paper
    sim_scores_1 = list(enumerate(cosine_sim_1[idx]))

    # Sort the wp based on the similarity scores
    sim_scores_1 = sorted(sim_scores_1, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar wp
    sim_scores_1 = sim_scores_1[1:11]

    # Get the wp indices
    wp_indices_1 = [i[0] for i in sim_scores_1]

    # Return the top 10 most similar working papers
    return pd.DataFrame({'year_published': df['year'].iloc[wp_indices_1],
                         'author': df['author'].iloc[wp_indices_1],
                         'title': df['title'].iloc[wp_indices_1],
                        'program': df['program'].iloc[wp_indices_1]})

In [23]:
get_recommendations_1('Exports Foreign Investment Pharmaceutical Industry', cosine_sim_1=cosine_sim_1)

Unnamed: 0,year_published,author,title,program
6,1976,"[Merle Yahr Weiss, Robert E Lipsey]",Exports Foreign Investment Manufacturing Industries,"[International Trade and Investment, International Finance and Macroeconomics]"
2821,1989,[Joosung Jun],What Marginal Source Funds Foreign Investment,[Public Economics]
4609,1994,"[Elhanan Helpman, Gene M Grossman]",Foreign Investment Endogenous Protection,[International Trade and Investment]
4700,1994,"[Ann E Harrison, Brian Aitken, Gordon H Hanson]",Spillovers Foreign Investment Export Behavior,[International Trade and Investment]
5894,1997,[Fiona M Scott Morton],Entry Decisions Generic Pharmaceutical Industry,[Industrial Organization]
19974,2014,"[Keith M Drake, Martha A Starr, Thomas G McGuire]",Do Reverse Payment Settlements Brand Generic Patent Disputes Pharmaceutical Industry Constitute Anticompetitive Pay Delay,"[Health Care, Law and Economics]"
11960,2006,"[C Fritz Foley, James R Hines, Jr, Mihir A Desai]",Capital Structure Risky Foreign Investment,"[Corporate Finance, International Trade and Investment]"
4854,1995,"[Gordon H Hanson, Robert C Feenstra]",Foreign Investment Outsourcing Relative Wages,[International Trade and Investment]
3369,1991,[Magnus Blomstrom],Host Country Benefits Foreign Investment,"[International Trade and Investment, International Finance and Macroeconomics]"
2472,1988,"[Guy V G Stevens, Robert E Lipsey]",Interactions Domestic Foreign Investment,"[International Trade and Investment, International Finance and Macroeconomics]"
