In [None]:
import os

import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# Load Data

### List the files inside the zip folder

In [None]:
# Show which files ship with the dataset so we know what can be loaded.
# `os` is imported in the notebook's top import cell rather than here, so
# all dependencies are visible in one place.
print(os.listdir("../input/tabs_bra"))

### Load first .csv file

In [None]:
# Read the documents-counts table into `df`, the DataFrame every later cell uses.
df = pd.read_csv('../input/tabs_bra/documents_counts.csv')

# Understand Data

### Get information about columns and rows

In [None]:
# Print the column names, non-null counts and dtypes of the loaded table.
df.info()

In [None]:
# Peek at the first rows (pandas shows 5 by default) to see what the values look like.
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

### At this point we can already see that something is odd about the maximum number of authors, pages and references. Let's have a look at some of the string-typed columns.

### How many thematic areas are there in the table?

In [None]:
# Number of distinct thematic-area values; missing values are not counted.
df['title thematic areas'].nunique(dropna=True)

### How many unique journals/publications are in the table?

In [None]:
# Number of distinct journal/publication titles; missing values are not counted.
df['title at SciELO'].nunique(dropna=True)

### What are the categories of current status for each journal/publication?

In [None]:
# Distinct status labels, returned in order of first appearance.
pd.unique(df['title current status'])

### How many document types?

In [None]:
# Number of distinct document types; missing values are not counted.
df['document type'].nunique(dropna=True)

### How many different years of publication are included?

In [None]:
# Number of distinct publishing years; missing values are not counted.
df['document publishing year'].nunique(dropna=True)

This means that the range of publishing years is too large. We probably aren't interested in data about papers published 100 years ago.

### How many unique extraction dates are there?

In [None]:
# Number of distinct extraction dates; missing values are not counted.
df['extraction date'].nunique(dropna=True)

This is also disappointing: the column contains only a single distinct value, meaning all data in the table was extracted on the same day. The entire column is therefore useless for analysis.

# Data Exploration through Visualization

### Use seaborn to create a countplot by thematic area

In [None]:
# Row counts per thematic area (y-axis), coloured by current status.
# plt.rc changes the global rcParams, so this label size also carries over
# to figures created by later cells.
plt.rc('axes', labelsize=12)
plt.figure(figsize=(6, 10))
sns.countplot(
    data=df,
    y='title thematic areas',
    hue='title current status',
    palette='viridis',
    dodge=False,  # draw the hue levels at the same position, not side by side
)

### Use seaborn to create a countplot by document type

In [None]:
# Row counts per document type (y-axis), coloured by current status.
# plt.rc changes the global rcParams, so this label size also carries over
# to figures created by later cells.
plt.rc('axes', labelsize=16)
plt.figure(figsize=(10, 6))
sns.countplot(
    data=df,
    y='document type',
    hue='title current status',
    palette='viridis',
    dodge=False,  # draw the hue levels at the same position, not side by side
)

### Use seaborn to create a countplot by year of publication starting in 1970.

In [None]:
# Row counts per publishing year from 1970 onwards, coloured by current status.
#
# The label size has to be set BEFORE plt.xticks(): that call creates the axes
# (through gca()), and the axis-label font size is taken from rcParams when the
# axes are created. With the rc call placed after plt.xticks(), a fresh run
# rendered this figure with the label size left over from the previous cell.
plt.rc('axes', labelsize=22)
plt.figure(figsize=(18,8))
plt.xticks(rotation=45)
sns.countplot(
    x='document publishing year',
    data=df[df['document publishing year']>=1970],
    hue='title current status',
    palette='viridis',
    dodge=False,  # draw the hue levels at the same position, not side by side
)

In [None]:
# Pairwise relationships between authors, pages and references, restricted to
# rows with fewer than 15 authors and fewer than 50 pages.
sns.pairplot(
    df.loc[
        (df['authors'] < 15) & (df['pages'] < 50),
        ['authors', 'pages', 'references'],
    ]
)

In [None]:
# Distribution of page counts for rows whose document type is 'research-article'.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; consider
# histplot/displot once the installed seaborn version is confirmed.
sns.distplot(df.loc[df['document type'] == 'research-article', 'pages'])

In [None]:
# Distribution of author counts, restricted to rows with fewer than 15 authors.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; consider
# histplot/displot once the installed seaborn version is confirmed.
sns.distplot(df.loc[df['authors'] < 15, 'authors'])