In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
import sys
sys.path.insert(0, "../code-previous")
sys.path.insert(0, "..")

In [3]:
import pandas as pd
import numpy as np
import networkx as nx
import sklearn.feature_extraction as skfeatures
import utils
import time
import os

import nltk
stopwords = nltk.corpus.stopwords.words('english')

from unicodedata import category
from tqdm import tqdm
tqdm.pandas(desc="Progress")

import seaborn as sns
sns.set()

import matplotlib.pyplot as plt


path2rawdata = '/mnt/disks/vault/wos2017-parsed/'

In [4]:
raw_data_path = "/mnt/disks/vault/analysis-data/raw_data_full/raw_data_full.pql"
data = pd.read_pickle(raw_data_path)

In [5]:
# New fields to dataframe 
data['log_c5'] = data['c5'].progress_apply(lambda x: np.log(x+1))
data['character_count'] = data['Title'].progress_apply(lambda x: len(x))
data["word_count"] = data["Title"].progress_apply(lambda x: len(x.split()))
data['title_without_punct'] = data['Title'].progress_apply(lambda x: 
                                                           str.lower(''.join(ch for ch in str(x) if category(ch)[0] != 'P')))
data["title_without_stopwords"] = data['title_without_punct'].progress_apply(lambda x: 
                                                                            " ".join([word for word in x.split() if word not in stopwords]))


data["word_count"] = data["Title"].progress_apply(lambda x: len(x.split()))
data["cleaned_title_word_count"] = data["title_without_stopwords"].progress_apply(lambda x: len(x.split()))

Progress: 100%|██████████| 738469/738469 [00:01<00:00, 484356.58it/s]
Progress: 100%|██████████| 738469/738469 [00:00<00:00, 865778.32it/s] 
Progress: 100%|██████████| 738469/738469 [00:01<00:00, 561111.19it/s]
Progress: 100%|██████████| 738469/738469 [00:12<00:00, 58078.48it/s]
Progress: 100%|██████████| 738469/738469 [00:15<00:00, 49222.75it/s]
Progress: 100%|██████████| 738469/738469 [00:01<00:00, 558182.17it/s]
Progress: 100%|██████████| 738469/738469 [00:01<00:00, 615515.30it/s]


In [6]:
grouped_titles = data.groupby('title_without_punct').size().reset_index(name='times_repeated')
select_titles = grouped_titles[grouped_titles.times_repeated >= 2]

_data = select_titles.merge(data,
                           how = 'left',
                           left_on = 'title_without_punct',
                           right_on = 'title_without_punct')

_data = _data[['Title','PubYear_x', 'Journal', 'c5', 'log_c5', 'times_repeated']]
_data = _data.sort_values(by=['times_repeated','Title'], ascending=False)

print("Number of unique titles which are reprated: ", len(_data['Title'].unique()))

Number of unique titles which are reprated:  1776


Questions:
    
1. Do they all appear in the same journal
2. If they are not from the same journal, how is their impact varying by year / journal
3. How has their citations been varying over the years

Many titles are repeated in the journal: PHILOSOPHICAL MAGAZINE

Article Names to check upon:
    
    CORRECTION, SILICON MOLECULAR-BEAM EPITAXY

In [7]:
# remove the articles from philosophical magazine - with the same name and year
print("Removing philosophical magazine")
_data_filtered = _data[_data.Journal != 'PHILOSOPHICAL MAGAZINE']
print("Count: ", len(_data_filtered))

print("Grouping articles by name, journal and year")
_data_filtered = _data_filtered.groupby([_data_filtered.Title, _data_filtered.Journal, _data.PubYear_x]).size().reset_index(name='times_repeated')
print("Count: ", len(_data_filtered))

# group by titles and see if they have appeared in different journals
_data_filtered_update = _data_filtered.groupby(["Title"])["Journal"].count().reset_index(name='times_repeated')
titles_to_look = list(_data_filtered_update['Title'])
print("Filtered Count: ", _data_filtered.count())


_data_filtered = _data[_data.Title.isin(titles_to_look)]
print("Updated Count: ", len(_data_filtered))

Removing philosophical magazine
Count:  2517
Grouping articles by name, journal and year
Count:  2001
Filtered Count:  Title             2001
Journal           2001
PubYear_x         2001
times_repeated    2001
dtype: int64
Updated Count:  2540


In [8]:
_data_filtered = _data_filtered[_data_filtered.Title != "untitled"]
_data_filtered = _data_filtered[_data_filtered.Title != "UNTITLED"]
_data_filtered = _data_filtered[_data_filtered.Title != "Untitled"]

_data_filtered.head(20)

Unnamed: 0,Title,PubYear_x,Journal,c5,log_c5,times_repeated
510,CORRECTION,1987,JOURNAL OF VACUUM SCIENCE & TECHNOLOGY A,0.0,0.0,13
511,CORRECTION,1990,PHYSICA STATUS SOLIDI A-APPLIED RESEARCH,0.0,0.0,13
512,CORRECTION,1985,PHYSICA STATUS SOLIDI A-APPLIED RESEARCH,1.0,0.693147,13
513,CORRECTION,1985,JAPANESE JOURNAL OF APPLIED PHYSICS PART 2-LET...,0.0,0.0,13
514,CORRECTION,1955,REVIEW OF SCIENTIFIC INSTRUMENTS,1.0,0.693147,13
515,CORRECTION,1957,JOURNAL OF APPLIED PHYSICS,0.0,0.0,13
516,CORRECTION,1957,JOURNAL OF APPLIED PHYSICS,1.0,0.693147,13
517,CORRECTION,1950,REVIEW OF SCIENTIFIC INSTRUMENTS,0.0,0.0,13
518,CORRECTION,1953,BRITISH JOURNAL OF APPLIED PHYSICS,0.0,0.0,13
519,CORRECTION,1965,INFRARED PHYSICS,0.0,0.0,13


In [16]:
pd.set_option('display.max_columns', None)  


In [26]:
# CORRECTION / BIBILIOGRAPHY / SOLID-STATE BIBLIOGRAPHY / NOVEMBER 1994 /NOVEMBER 1994 / To the editor /

# Most titles with same name are from same journals, same year

# Good titles: X-RAY-LITHOGRAPHY, CURRENT DENSITIES OF FREE-MOVING CATHODE SPOTS, NEW HIGH-PERFORMANCE DISKS FOR TURBOMOLECULAR...
# MECHANICAL MODELS FOR THE REPRESENTATION OF VI.., INVERSE PHOTOEMISSION, CRYOPUMPING

# Interesting: 
# EQUILIBRIUM SPACE-CHARGE DISTRIBUTIONS IN SEMI.., 
# ZONE REFINING OF ANTHRACENE, 
# Variation of modal dispersion and bandwidth with temperature in PMMA based step-index polymer optical fibers
# Van Hove singularities in intersubband transitions in multiquantum well photodetectors
# VOLUME AND SURFACE SELF-DIFFUSION MEASUREMENTS ON COPPER BY THERMAL SURFACE SMOOTHING
# VIBRATING WIRE VISCOMETER, VARIATION OF CONCENTRATION WITH DEPTH OF ABSORBED OXYGEN IN NIOBIUM DURING OXIDATION,
# VACUUM ULTRAVIOLET EMISSION AND H- PRODUCTION IN A LOW-PRESSURE HYDROGEN PLASMA
# Two laterally arranged quantum dot systems with strong capacitive interdot coupling
# Total and negative refraction of electromagnetic waves
# Top-gated graphene field-effect-transistors formed by decomposition of SiC
# Thickness dependence of structural, optical and electrical properties of ZnO : Al films prepared on flexible substrates
# Thermal stability of ohmic contacts to InN
# The use and abuse of the terms percent, parts per million and parts in 10(n)
# The stability range of lead oxide compounds in BSCCO-2223 precursor powders
# The investigation of dark current reduction in MSM photodetector based on porous GaN
# The effects of buoyancy convection on the measured solute diffusion coefficients in dilute metallic liquids


_data_filtered[340:370]

Unnamed: 0,Title,PubYear_x,Journal,c5,log_c5,times_repeated
2774,The cold quality Q(cold) for magnicon on resonator in the rotating TMn10 mode,1997,INTERNATIONAL JOURNAL OF INFRARED AND MILLIMETER WAVES,0.0,0.0,2
2775,The cold quality Q(cold) for magnicon on resonator in the rotating TM(n10) mode,1997,INTERNATIONAL JOURNAL OF INFRARED AND MILLIMETER WAVES,0.0,0.0,2
3039,The U. S. Naval Research Laboratory,1939,JOURNAL OF APPLIED PHYSICS,0.0,0.0,2
3040,The U. S. Naval Research Laboratory,1944,JOURNAL OF APPLIED PHYSICS,0.0,0.0,2
2723,Terahertz time-domain spectroscopy,2005,TERAHERTZ OPTOELECTRONICS,7.0,2.079442,2
2724,Terahertz time-domain spectroscopy,1998,MILLIMETER AND SUBMILLIMETER WAVE SPECTROSCOPY OF SOLIDS,13.0,2.639057,2
2715,Temperature scaling of the flux pinning force for SmBa2Cu3O7-x single crystal,2005,MODERN PHYSICS LETTERS B,0.0,0.0,2
2716,Temperature scaling of the flux pinning force for SmBa2Cu3O7-x single crystal,2005,INTERNATIONAL JOURNAL OF MODERN PHYSICS B,0.0,0.0,2
2709,Temperature influence on the dynamics of vertically aligned liquid crystal displays,2008,APPLIED PHYSICS LETTERS,0.0,0.0,2
2710,Temperature influence on the dynamics of vertically aligned liquid crystal displays,2008,APPLIED PHYSICS LETTERS,7.0,2.079442,2


In [27]:
# for every "title_without_punct" how many unique "journal" are present?
# grouped_titles = data.groupby('title_without_punct')['Journal'].size().reset_index(name='times_repeated')
# grouped_titles = grouped_titles[grouped_titles.times_repeated > 1]
# grouped_titles

In [None]:
# distance between every article titles
