In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#Displays Other language characters correctly if exists
encode_type = 'maccentraleurope'  

journals = pd.read_csv("journals_out.csv", encoding = encode_type)

journals.head()

Unnamed: 0,issn,journal_name,pub_name,is_hybrid,category,url
0,1654-9880,Global Health Action,Co-Action Publishing,0.0,Medicine,
1,2278-9480,International Journal of Applied Research & St...,,0.0,,
2,1980-5918,Fisioterapia Em Movimento,PontifÔŅĹcia Universidade CatÔŅĹlica do ParanÔŅĹ,0.0,Medicine,http://www.pucpr.br/revfisio
3,1121-760X,European Journal of Histochemistry,,0.0,,
4,2319-5754,Species,Discovery Publication,0.0,,


In [3]:
journals.shape


(105496, 6)

In [4]:
journals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105496 entries, 0 to 105495
Data columns (total 6 columns):
issn            105492 non-null object
journal_name    105492 non-null object
pub_name        93631 non-null object
is_hybrid       105496 non-null float64
category        54743 non-null object
url             51095 non-null object
dtypes: float64(1), object(5)
memory usage: 4.8+ MB


In [5]:
# Counting number of null values in each columns
journals.isnull().sum()

issn                4
journal_name        4
pub_name        11865
is_hybrid           0
category        50753
url             54401
dtype: int64

For 2 columns theres are 50% values are missing . But it is still early to decide whether to drop the columns or replace them with some values. Lets do more analysis .

In [6]:
#There are four rows for which the issn value is null
journals.loc[journals['issn'].isnull()]

Unnamed: 0,issn,journal_name,pub_name,is_hybrid,category,url
3706,,,,10.0,,
46971,,,,10.0,,
82463,,,,10.0,,
82957,,,,10.0,,


Dropping the 4 rows

In [7]:
rows_to_drop = journals.loc[journals['issn'].isnull()].index
print("Row indices to drop: " , rows_to_drop)

journals.drop(rows_to_drop, inplace = True)

journals.isnull().sum()

Row indices to drop:  Int64Index([3706, 46971, 82463, 82957], dtype='int64')


issn                0
journal_name        0
pub_name        11861
is_hybrid           0
category        50749
url             54397
dtype: int64

In [8]:
#Checking journal categories
print(journals.category.unique())

print("-----------")

unique_categories_previous = len(journals.category.unique())

#Checking unique cateogories
print("Unique Categories :" , unique_categories_previous)

['Medicine' nan 'Science' 'Education' 'Law' 'Agriculture'
 'Social Sciences' 'General Works' 'Geography. Anthropology. Recreation'
 'History (General) and history of Europe' 'Technology' 'Fine Arts'
 'Political science' 'Language and Literature'
 'Philosophy. Psychology. Religion' 'Auxiliary sciences of history'
 'Veterinary' 'History America' 'STRUCTURAL ENGINEERING' 'Anthropology'
 'Agriculture | Science' 'Ecology and Evolution' 'MEDICINE'
 'Sports Medicine' 'Literary Studies' 'Naval Science' 'VETERINARY'
 'Technology | Philosophy. Psychology. Religion' 'Neuroscience'
 'Mathematics' 'ECOLOGY AND EVOLUTION' 'Fine Arts | Science'
 'MOLECULAR AND CELL BIOLOGY' 'Molecular and Cell Biology'
 'Education | Geography. Anthropology. Recreation'
 'Fine Arts | Language and Literature'
 'Geography. Anthropology. Recreation | Science' 'Education | Medicine'
 'ONCOLOGY' 'Agriculture | Technology | Science' 'Circuits'
 'Political science | Social Sciences' 'Music and books on Music'
 'Medicine | Sc

From the above data, it is clear that the category names are not uniform in the letter case which means there is a possib;ity of having a duplicate entry based on lower case and upper case.

In [9]:

# Capitalizing each word
print((journals.category.str.title()).unique())

print("\n-----------\n")

# Checking the distribution of categories
print('Unique categories before capitalization: ' , unique_categories_previous)
print('Unique categories after capitalization : ', len((journals.category.str.title()).unique()))

['Medicine' nan 'Science' 'Education' 'Law' 'Agriculture'
 'Social Sciences' 'General Works' 'Geography. Anthropology. Recreation'
 'History (General) And History Of Europe' 'Technology' 'Fine Arts'
 'Political Science' 'Language And Literature'
 'Philosophy. Psychology. Religion' 'Auxiliary Sciences Of History'
 'Veterinary' 'History America' 'Structural Engineering' 'Anthropology'
 'Agriculture | Science' 'Ecology And Evolution' 'Sports Medicine'
 'Literary Studies' 'Naval Science'
 'Technology | Philosophy. Psychology. Religion' 'Neuroscience'
 'Mathematics' 'Fine Arts | Science' 'Molecular And Cell Biology'
 'Education | Geography. Anthropology. Recreation'
 'Fine Arts | Language And Literature'
 'Geography. Anthropology. Recreation | Science' 'Education | Medicine'
 'Oncology' 'Agriculture | Technology | Science' 'Circuits'
 'Political Science | Social Sciences' 'Music And Books On Music'
 'Medicine | Science' 'Infectious Diseases'
 'Military Science | Political Science' 'Physics 

In [10]:
print(journals.is_hybrid.unique())

[ 0.  1. 10.]


 It is clear that there are duplicates because of the capitalization. Therefore, making the category column uniform in terms of capitalization


Checking Duplicate values and dropping them based on 'issn' column.

In [11]:
journals.shape

(105492, 6)

In [0]:
journals.drop_duplicates(subset='issn',keep='first',inplace=True)

In [13]:
journals.shape

(13248, 6)

In [14]:
journals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13248 entries, 0 to 105494
Data columns (total 6 columns):
issn            13248 non-null object
journal_name    13248 non-null object
pub_name        11679 non-null object
is_hybrid       13248 non-null float64
category        6818 non-null object
url             6386 non-null object
dtypes: float64(1), object(5)
memory usage: 724.5+ KB


Replacig the Nan Values in Category to 'Other'

In [15]:
journals['category'].fillna(value='Other',inplace=True)
journals.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 13248 entries, 0 to 105494
Data columns (total 6 columns):
issn            13248 non-null object
journal_name    13248 non-null object
pub_name        11679 non-null object
is_hybrid       13248 non-null float64
category        13248 non-null object
url             6386 non-null object
dtypes: float64(1), object(5)
memory usage: 724.5+ KB


Replacing the Nan values in url to 'No Url'

In [16]:
journals['url'].fillna(value='No Url',inplace=True)
journals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13248 entries, 0 to 105494
Data columns (total 6 columns):
issn            13248 non-null object
journal_name    13248 non-null object
pub_name        11679 non-null object
is_hybrid       13248 non-null float64
category        13248 non-null object
url             13248 non-null object
dtypes: float64(1), object(5)
memory usage: 724.5+ KB


Replacing the Nan in pub name with 'Other'

In [17]:
journals['pub_name'].fillna(value='Other',inplace=True)
journals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13248 entries, 0 to 105494
Data columns (total 6 columns):
issn            13248 non-null object
journal_name    13248 non-null object
pub_name        13248 non-null object
is_hybrid       13248 non-null float64
category        13248 non-null object
url             13248 non-null object
dtypes: float64(1), object(5)
memory usage: 724.5+ KB


Relacing the numeric url with the literal 'No Url' as we know this is invalid.

In [0]:

journals.loc[journals.url.str.isnumeric() == True] = 'No Url'


In [19]:
journals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13248 entries, 0 to 105494
Data columns (total 6 columns):
issn            13248 non-null object
journal_name    13248 non-null object
pub_name        13248 non-null object
is_hybrid       13248 non-null object
category        13248 non-null object
url             13248 non-null object
dtypes: object(6)
memory usage: 724.5+ KB


In [20]:
journals

Unnamed: 0,issn,journal_name,pub_name,is_hybrid,category,url
0,1654-9880,Global Health Action,Co-Action Publishing,0,Medicine,No Url
1,2278-9480,International Journal of Applied Research & St...,Other,0,Other,No Url
2,1980-5918,Fisioterapia Em Movimento,PontifÔŅĹcia Universidade CatÔŅĹlica do ParanÔŅĹ,0,Medicine,http://www.pucpr.br/revfisio
3,1121-760X,European Journal of Histochemistry,Other,0,Other,No Url
4,2319-5754,Species,Discovery Publication,0,Other,No Url
...,...,...,...,...,...,...
105487,00102-0453,anthropologie56,Other,0,Other,No Url
105488,00102-0458,anthropologie61,Other,0,Other,No Url
105492,00102-0455,anthropologie58,Other,0,Other,No Url
105493,00102-0408,anthropologie11,Other,0,Other,No Url


# Estimated Article Influence Scores

In [21]:


estimated = pd.read_csv("estimated-article-influence-scores-2015.csv", encoding = encode_type)

estimated.head()

Unnamed: 0.1,Unnamed: 0,journal_name,issn,citation_count_sum,paper_count_sum,avg_cites_per_paper,proj_ai,proj_ai_year
0,0,3d research,2092-6731,151.0,106.0,1.424528,0.29,2015
1,1,aaps pharmscitech,1530-9932,2208.0,801.0,2.756554,0.665,2015
2,2,abstract and applied analysis,1687-0409,3005.0,2923.0,1.028053,0.192,2015
3,3,academic psychiatry,1545-7230,537.0,490.0,1.095918,0.208,2015
4,4,academic questions,1936-4709,40.0,67.0,0.597015,0.097,2015


In [22]:
estimated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3615 entries, 0 to 3614
Data columns (total 8 columns):
Unnamed: 0             3615 non-null int64
journal_name           3615 non-null object
issn                   3615 non-null object
citation_count_sum     3603 non-null float64
paper_count_sum        3603 non-null float64
avg_cites_per_paper    3603 non-null float64
proj_ai                3603 non-null float64
proj_ai_year           3615 non-null int64
dtypes: float64(4), int64(2), object(2)
memory usage: 226.1+ KB


In [23]:
estimated.shape

(3615, 8)

In [24]:
estimated.describe()

Unnamed: 0.1,Unnamed: 0,citation_count_sum,paper_count_sum,avg_cites_per_paper,proj_ai,proj_ai_year
count,3615.0,3603.0,3603.0,3603.0,3603.0,3615.0
mean,1831.496542,1446.71385,509.671107,2.11897,0.516554,2015.0
std,1056.690131,7996.860704,1923.905084,1.687837,0.572554,0.0
min,0.0,0.0,1.0,0.0,0.0,2015.0
25%,916.5,150.0,107.0,1.054941,0.199,2015.0
50%,1835.0,421.0,239.0,1.717822,0.367,2015.0
75%,2746.5,1097.0,522.0,2.710734,0.6515,2015.0
max,3659.0,426949.0,104705.0,26.318824,11.367,2015.0


Replacing the null values in the column with the mean of the given values

In [0]:
estimated['citation_count_sum'].fillna(value=estimated['citation_count_sum'].mean(),inplace=True)
estimated['paper_count_sum'].fillna(value=estimated['paper_count_sum'].mean(),inplace=True)
estimated['avg_cites_per_paper'].fillna(value=estimated['avg_cites_per_paper'].mean(),inplace=True)
estimated['proj_ai'].fillna(value=estimated['proj_ai'].mean(),inplace=True)



In [26]:
estimated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3615 entries, 0 to 3614
Data columns (total 8 columns):
Unnamed: 0             3615 non-null int64
journal_name           3615 non-null object
issn                   3615 non-null object
citation_count_sum     3615 non-null float64
paper_count_sum        3615 non-null float64
avg_cites_per_paper    3615 non-null float64
proj_ai                3615 non-null float64
proj_ai_year           3615 non-null int64
dtypes: float64(4), int64(2), object(2)
memory usage: 226.1+ KB


In [27]:
len(estimated['issn'].unique())

3615

There seem to be no duplicate for issan column . So there is no required to drop the values.

In [28]:
estimated

Unnamed: 0.1,Unnamed: 0,journal_name,issn,citation_count_sum,paper_count_sum,avg_cites_per_paper,proj_ai,proj_ai_year
0,0,3d research,2092-6731,151.0,106.0,1.424528,0.290,2015
1,1,aaps pharmscitech,1530-9932,2208.0,801.0,2.756554,0.665,2015
2,2,abstract and applied analysis,1687-0409,3005.0,2923.0,1.028053,0.192,2015
3,3,academic psychiatry,1545-7230,537.0,490.0,1.095918,0.208,2015
4,4,academic questions,1936-4709,40.0,67.0,0.597015,0.097,2015
...,...,...,...,...,...,...,...,...
3610,3655,zoologica poloniae,2083-6112,0.0,10.0,0.000000,0.000,2015
3611,3656,zoological studies,1021-5506,381.0,290.0,1.313793,0.262,2015
3612,3657,zoology,0944-2006,472.0,221.0,2.135747,0.483,2015
3613,3658,zoomorphology,1432-234X,191.0,125.0,1.528000,0.317,2015


#Price.csv Cleaning


In [29]:

price = pd.read_csv("price.csv", encoding = encode_type)
price.head()

Unnamed: 0,id,price,date_stamp,journal_id,influence_id,url,license
0,8691,1400.0,2016-08-11,2051-5960,,,
1,8692,2175.0,2016-08-11,1758-9193,,,
2,8693,2145.0,2016-08-11,1476-0711,,,
3,8694,2145.0,2016-08-11,2047-2994,,,
4,8695,2145.0,2016-08-11,1744-9081,,,


In [30]:
price.describe()

Unnamed: 0,id,price,influence_id,license
count,7795.0,7795.0,759.0,378.0
mean,12588.002309,1273.908794,4570.26087,3.989418
std,2250.370998,1264.786225,583.092441,3.716034
min,8691.0,0.0,209.0,1.0
25%,10639.5,0.0,4324.5,1.0
50%,12588.0,1000.0,4571.0,1.0
75%,14536.5,2580.0,4937.5,6.0
max,16486.0,7590.0,5516.0,10.0


In [31]:
price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7795 entries, 0 to 7794
Data columns (total 7 columns):
id              7795 non-null int64
price           7795 non-null float64
date_stamp      7795 non-null object
journal_id      7795 non-null object
influence_id    759 non-null float64
url             364 non-null object
license         378 non-null float64
dtypes: float64(3), int64(1), object(3)
memory usage: 426.4+ KB


Checking the duplicated based on 'journal_id'  values

In [32]:


len(price['journal_id'].unique())

5720

There seem to be the duplicate values in price. Dropping the duplicate values.

In [33]:
price.drop_duplicates(subset='journal_id',keep='first',inplace=True)
price.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5720 entries, 0 to 7793
Data columns (total 7 columns):
id              5720 non-null int64
price           5720 non-null float64
date_stamp      5720 non-null object
journal_id      5720 non-null object
influence_id    290 non-null float64
url             95 non-null object
license         101 non-null float64
dtypes: float64(3), int64(1), object(3)
memory usage: 357.5+ KB


Dropping the columns url,license and influence_id as it is only present in less than 10% of the rows.

In [0]:
price.drop(['influence_id', 'url','license'], axis = 1,inplace=True)

In [35]:
price.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5720 entries, 0 to 7793
Data columns (total 4 columns):
id            5720 non-null int64
price         5720 non-null float64
date_stamp    5720 non-null object
journal_id    5720 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 223.4+ KB


In [36]:
price.describe()

Unnamed: 0,id,price
count,5720.0,5720.0
mean,12375.076224,1261.147559
std,2078.186712,1332.802652
min,8691.0,0.0
25%,10963.75,0.0
50%,12437.5,651.0
75%,14070.25,3000.0
max,16485.0,5000.0


In [37]:
price

Unnamed: 0,id,price,date_stamp,journal_id
0,8691,1400.0,2016-08-11,2051-5960
1,8692,2175.0,2016-08-11,1758-9193
2,8693,2145.0,2016-08-11,1476-0711
3,8694,2145.0,2016-08-11,2047-2994
4,8695,2145.0,2016-08-11,1744-9081
...,...,...,...,...
7789,16481,75.0,2017-09-30,2349-3755
7790,16482,80.0,2017-09-30,2456-1908
7791,16483,85.0,2017-09-30,2350-0530
7792,16484,800.0,2017-10-18,2251-3582


Since 25 % of the price values are 0 , it should be repplaced by mean values or some other values. As no journal should be free. Lets assign the price value to the mean value for the journals which have price 0.


In [0]:
price['price'].replace(0, price['price'].mean(),inplace=True)

In [39]:
price.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5720 entries, 0 to 7793
Data columns (total 4 columns):
id            5720 non-null int64
price         5720 non-null float64
date_stamp    5720 non-null object
journal_id    5720 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 223.4+ KB


In [40]:
price.describe()

Unnamed: 0,id,price
count,5720.0,5720.0
mean,12375.076224,1801.103918
std,2078.186712,896.479416
min,8691.0,1.0
25%,10963.75,1261.147559
50%,12437.5,1261.147559
75%,14070.25,3000.0
max,16485.0,5000.0


Converting the clean data into new csv files.

In [0]:
journals.to_csv('cleaned_journals.csv',index=True) 
price.to_csv('cleaned_price.csv', index=True) 
estimated.to_csv('cleaned_estimated_article.csv',index=True)