# Project2_Taewoo Kim_20233219

## 1. Reading the raw files and checking the dataset

In [1]:
import os
import numpy as np, pandas as pd, matplotlib.pyplot as plt
import string, spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from gensim.corpora.dictionary import Dictionary
from gensim import models
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora, models, utils
from gensim.parsing.preprocessing import remove_stopwords
import nltk
from nltk.stem.porter import PorterStemmer

# Notebook-wide warning configuration.
# NOTE(review): silencing the chained-assignment warning globally can hide real
# view-vs-copy mistakes in later cells — confirm this is intended.
pd.set_option('mode.chained_assignment',  None) # turn off the pandas chained-assignment warning
import warnings
warnings.filterwarnings("ignore", category=FutureWarning) # turn off FutureWarning messages

In [2]:
# Collect every .xls export in the data folder (sorted so the row order is
# reproducible), stack them into one frame, and save a CSV snapshot of the result.
text_names = sorted([text for text in os.listdir('pjt2_data/') if text.endswith('.xls')])
if not text_names:
    # pd.concat fails with an opaque "No objects to concatenate" on an empty list;
    # fail early with a message that names the actual problem.
    raise FileNotFoundError("No .xls files found in 'pjt2_data/'")
# ignore_index=True: read_excel gives every file a fresh 0..n-1 index, so without
# it the combined frame would carry duplicated row labels.
text_raw = pd.concat([pd.read_excel('pjt2_data/' + file) for file in text_names], ignore_index=True)
text_raw.to_csv('text_raw.csv', index=False)

In [3]:
# Keep only the bibliographic columns used in the rest of the notebook.
NLP_data = text_raw[[
    'Article Title',
    'Author Keywords',
    'Keywords Plus',
    'Abstract',
    'Affiliations',
    'Publication Year',
    'Journal ISO Abbreviation',
]]

In [4]:
# Replace the verbose export headers with short column names, then preview the result.
NLP_data1 = NLP_data.rename(
    columns={
        'Article Title': 'Title',
        'Author Keywords': 'Akwds',
        'Keywords Plus': 'Kwds',
        'Abstract': 'Abst',
        'Affiliations': 'Affil',
        'Publication Year': 'Year',
        'Journal ISO Abbreviation': 'Jabb',
    }
)

NLP_data1.head()

Unnamed: 0,Title,Akwds,Kwds,Abst,Affil,Year,Jabb
0,PACING STRATEGIC CHANGE - THE CASE OF A NEW VE...,,PERFORMANCE; TIME; MANAGEMENT; DECISIONS; FIRM,Theorists debate whether organizations are ine...,,1994.0,Acad. Manage. J.
1,GENDER DIFFERENCES IN MANAGERS DEVELOPMENTAL J...,,RATIONAL BIAS; DISCRIMINATION; EMPLOYMENT; IMP...,It has been suggested that one reason so few w...,,1994.0,Acad. Manage. J.
2,INVESTIGATING THE GLASS CEILING PHENOMENON - A...,,SELECTION DECISION-MODELS; MANAGERIAL POSITION...,Although a ''glass ceiling'' is said to keep w...,University of Massachusetts System; University...,1994.0,Acad. Manage. J.
3,BRINGING THE INDIVIDUAL BACK IN - A STRUCTURAL...,,PERFORMANCE-APPRAISAL RESEARCH; INFORMANT ACCU...,We challenge the claimed incommensurability of...,Carnegie Mellon University,1994.0,Acad. Manage. J.
4,EXTENDING MODERN PORTFOLIO THEORY INTO THE DOM...,,MODERN FINANCIAL THEORY; SYSTEMATIC-RISK; CAPI...,It is widely held that diversification lowers ...,Case Western Reserve University,1994.0,Acad. Manage. J.


In [5]:
# Show the original column labels of the selected subset (before renaming).
NLP_data.columns

Index(['Article Title', 'Author Keywords', 'Keywords Plus', 'Abstract',
       'Affiliations', 'Publication Year', 'Journal ISO Abbreviation'],
      dtype='object')

In [6]:
# Show the shortened column labels produced by the rename.
NLP_data1.columns

Index(['Title', 'Akwds', 'Kwds', 'Abst', 'Affil', 'Year', 'Jabb'], dtype='object')

In [7]:
# Count missing values per column to see which fields need cleaning.
NLP_data1.isna().sum()

Title       0
Akwds    2501
Kwds     1088
Abst      683
Affil     841
Year      599
Jabb        0
dtype: int64

In [8]:
# Inspect column dtypes. Year shows as float64 here, presumably because it
# still contains missing values at this stage.
NLP_data1.dtypes

Title     object
Akwds     object
Kwds      object
Abst      object
Affil     object
Year     float64
Jabb      object
dtype: object

In [9]:
# Discard every record that has no abstract.
NLP_data2 = NLP_data1.dropna(subset=['Abst'])

In [10]:
# Re-count missing values after removing rows without an abstract.
NLP_data2.isna().sum()

Title       0
Akwds    1829
Kwds      442
Abst        0
Affil     507
Year      599
Jabb        0
dtype: int64

In [11]:
# Fill missing publication years with 2023.
# NOTE(review): presumably rows without a year are articles not yet assigned to
# an issue — confirm that 2023 is the right value for them.
# .assign builds a new frame instead of writing into NLP_data2 in place:
# NLP_data2 is a filtered selection of NLP_data1, and item assignment on such a
# selection is the chained-assignment pattern pandas warns about.
NLP_data2 = NLP_data2.assign(Year=NLP_data2['Year'].fillna(2023))

In [12]:
# Check that Year no longer has missing values after the fill.
NLP_data2.isna().sum()

Title       0
Akwds    1829
Kwds      442
Abst        0
Affil     507
Year        0
Jabb        0
dtype: int64

In [13]:
# Number of distinct titles per journal, for comparison with the raw row count.
NLP_data2.groupby('Jabb')['Title'].nunique()

Jabb
Acad. Manage. J.       1774
Manage. Sci.           5398
Organ Sci.             1781
Strateg. Manage. J.    2282
Name: Title, dtype: int64

In [14]:
NLP_data2.groupby('Jabb')['Title'].count() # if nunique and count differ, duplicate values exist

Jabb
Acad. Manage. J.       1774
Manage. Sci.           5400
Organ Sci.             1781
Strateg. Manage. J.    2282
Name: Title, dtype: int64

In [15]:
# Remove rows whose Title repeats an earlier row, keeping the first occurrence.
NLP_data3 = NLP_data2.drop_duplicates(subset=['Title'], keep='first')

In [16]:
# Title counts per journal after removing duplicated titles.
NLP_data3.groupby('Jabb')['Title'].count()

Jabb
Acad. Manage. J.       1774
Manage. Sci.           5398
Organ Sci.             1781
Strateg. Manage. J.    2282
Name: Title, dtype: int64

In [17]:
# Number of distinct abstracts per journal, for comparison with the row count.
NLP_data3.groupby('Jabb')['Abst'].nunique()

Jabb
Acad. Manage. J.       1774
Manage. Sci.           5395
Organ Sci.             1781
Strateg. Manage. J.    2282
Name: Abst, dtype: int64

In [18]:
NLP_data3.groupby('Jabb')['Abst'].count() # if nunique and count differ, duplicate values exist

Jabb
Acad. Manage. J.       1774
Manage. Sci.           5398
Organ Sci.             1781
Strateg. Manage. J.    2282
Name: Abst, dtype: int64

In [19]:
# Remove rows whose Abst repeats an earlier row, keeping the first occurrence.
NLP_data4 = NLP_data3.drop_duplicates(subset=['Abst'], keep='first')

In [20]:
# Abstract counts per journal after removing duplicated abstracts.
NLP_data4.groupby('Jabb')['Abst'].count()

Jabb
Acad. Manage. J.       1774
Manage. Sci.           5395
Organ Sci.             1781
Strateg. Manage. J.    2282
Name: Abst, dtype: int64

In [21]:
# Cast Year to an integer dtype.
# astype with a column mapping returns a new frame instead of writing into
# NLP_data4 in place: NLP_data4 comes from drop_duplicates, and item assignment
# on such a derived frame is the chained-assignment pattern pandas warns about.
NLP_data4 = NLP_data4.astype({'Year': 'int'})
NLP_data4.dtypes

Title    object
Akwds    object
Kwds     object
Abst     object
Affil    object
Year      int32
Jabb     object
dtype: object

In [22]:
# Missing-value counts remaining before the final fill.
NLP_data4.isna().sum()

Title       0
Akwds    1829
Kwds      442
Abst        0
Affil     507
Year        0
Jabb        0
dtype: int64

In [23]:
# Replace the remaining missing values with empty strings, then cast every
# column (Year included) to str.
NLP_data5 = (
    NLP_data4
    .fillna(value='')
    .astype(str)
)

In [24]:
# Write the cleaned table to CSV without the row index.
NLP_data5.to_csv('pjt2_data/NLP_finn.csv', index=False)