# Exploring Datasets with Numpy and Pandas

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np

## Reading the CSV file that contains our dataset

In [6]:
# Load the dataset from the CSV file (path is relative to the working directory).
df_0 = pd.read_csv(filepath_or_buffer="datasets.csv")

### The .head() method returns the first 5 rows (by default), or the first n rows when n is specified

In [9]:
# Preview the first six rows instead of the default five.
df_0.head(n=6)

Unnamed: 0,datasetName,about,link,categoryName,cloud,vintage
0,Microbiome Project,American Gut (Microbiome Project),https://github.com/biocore/American-Gut,Biology,GitHub,
1,GloBI,Global Biotic Interactions (GloBI),https://github.com/jhpoelen/eol-globi-data/wik...,Biology,GitHub,
2,Global Climate,Global Climate Data Since 1929,http://en.tutiempo.net/climate,Climate/Weather,,1929.0
3,CommonCraw 2012,3.5B Web Pages from CommonCraw 2012,http://www.bigdatanews.com/profiles/blogs/big-...,Computer Networks,,2012.0
4,Indiana Webclicks,53.5B Web clicks of 100K users in Indiana Univ.,http://cnets.indiana.edu/groups/nan/webtraffic...,Computer Networks,,
5,Criteo click-through,Criteo click-through data,http://labs.criteo.com/2015/03/criteo-releases...,Computer Networks,,


### The .tail() method returns the last 5 rows (by default), or the last n rows when n is specified

In [10]:
# Preview the last six rows instead of the default five.
df_0.tail(n=6)

Unnamed: 0,datasetName,about,link,categoryName,cloud,vintage
55,ClueWeb09 FACC,ClueWeb09 FACC,http://lemurproject.org/clueweb09/FACC1/,Natural Language,,2009.0
56,ClueWeb12 FACC,ClueWeb12 FACC,http://lemurproject.org/clueweb12/FACC1/,Natural Language,,2012.0
57,Google Ngrams,Google Books Ngrams (2.2TB),https://aws.amazon.com/datasets/google-books-n...,Natural Language,Amazon,
58,EDRM Enron,"EDRM Enron EMail of 151 users, hosted on S3",https://aws.amazon.com/datasets/enron-email-data/,Social Networks,Amazon,
59,GetGlue,GetGlue - users rating TV shows,http://getglue-data.s3.amazonaws.com/getglue_s...,Social Networks,,
60,Twitter RepLab,Twitter Data for Online Reputation Management,http://nlp.uned.es/replab2013/,Social Networks,,2013.0


## To check the data type of each column, we use the .dtypes attribute

In [13]:
# `dtypes` is an attribute (no parentheses); it lists the dtype of every column.
df_0.dtypes

datasetName      object
about            object
link             object
categoryName     object
cloud            object
vintage         float64
dtype: object

## Dropping unneeded columns with .drop() (pass the column name with axis=1, or equivalently use columns=...)

In [15]:
# Build a new frame without the `datasetName` column; `df_0` itself is left untouched.
df_1 = df_0.drop(columns="datasetName")
df_1.head()

Unnamed: 0,about,link,categoryName,cloud,vintage
0,American Gut (Microbiome Project),https://github.com/biocore/American-Gut,Biology,GitHub,
1,Global Biotic Interactions (GloBI),https://github.com/jhpoelen/eol-globi-data/wik...,Biology,GitHub,
2,Global Climate Data Since 1929,http://en.tutiempo.net/climate,Climate/Weather,,1929.0
3,3.5B Web Pages from CommonCraw 2012,http://www.bigdatanews.com/profiles/blogs/big-...,Computer Networks,,2012.0
4,53.5B Web clicks of 100K users in Indiana Univ.,http://cnets.indiana.edu/groups/nan/webtraffic...,Computer Networks,,


## Renaming columns

In [21]:
# Relabel the `about` column as `CompanyName` and rebind `df_0` to the renamed frame.
df_0 = df_0.rename({"about": "CompanyName"}, axis="columns")
df_0.head()


Unnamed: 0,datasetName,CompanyName,link,categoryName,cloud,vintage
0,Microbiome Project,American Gut (Microbiome Project),https://github.com/biocore/American-Gut,Biology,GitHub,
1,GloBI,Global Biotic Interactions (GloBI),https://github.com/jhpoelen/eol-globi-data/wik...,Biology,GitHub,
2,Global Climate,Global Climate Data Since 1929,http://en.tutiempo.net/climate,Climate/Weather,,1929.0
3,CommonCraw 2012,3.5B Web Pages from CommonCraw 2012,http://www.bigdatanews.com/profiles/blogs/big-...,Computer Networks,,2012.0
4,Indiana Webclicks,53.5B Web clicks of 100K users in Indiana Univ.,http://cnets.indiana.edu/groups/nan/webtraffic...,Computer Networks,,


## To get a concise summary of the DataFrame (column dtypes, non-null counts, memory usage), we use .info()

In [22]:
# Prints a summary of the frame: index range, non-null count and dtype per column, memory usage.
df_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   datasetName   61 non-null     object 
 1   CompanyName   61 non-null     object 
 2   link          61 non-null     object 
 3   categoryName  61 non-null     object 
 4   cloud         24 non-null     object 
 5   vintage       31 non-null     float64
dtypes: float64(1), object(5)
memory usage: 3.0+ KB


## To get summary statistics for the numeric columns, we use the .describe() method

In [23]:
# Summary statistics; with default arguments only the numeric columns are described.
df_0.describe()

Unnamed: 0,vintage
count,31.0
mean,2000.677419
std,24.43684
min,1920.0
25%,2006.5
50%,2009.0
75%,2012.0
max,2016.0


#### (Note): To get the dimensions of the dataset (rows, columns) quickly, we can use the .shape attribute

In [25]:
# `shape` is an attribute (no parentheses) holding the (rows, columns) tuple.
df_0.shape

(61, 6)

In [29]:
# Select the rows that fully repeat an earlier row; an empty result means no duplicates.
df_rows_duplicated = df_0.loc[df_0.duplicated(keep="first")]
print(df_rows_duplicated)


Empty DataFrame
Columns: [datasetName, CompanyName, link, categoryName, cloud, vintage]
Index: []


In [30]:
# Number of non-null entries in each column.
df_0.count()

datasetName     61
CompanyName     61
link            61
categoryName    61
cloud           24
vintage         31
dtype: int64

In [32]:
# De-duplicated version of the data. The result is only bound to `count`;
# `df_0` is not modified by this cell.
# NOTE(review): `count` holds a DataFrame, not a number — consider a clearer name,
# and showing its shape rather than printing the whole frame.
count = df_0.drop_duplicates(keep="first")
print(count)
df_0.head()

           datasetName                                      CompanyName  \
0   Microbiome Project                American Gut (Microbiome Project)   
1                GloBI               Global Biotic Interactions (GloBI)   
2       Global Climate                   Global Climate Data Since 1929   
3      CommonCraw 2012              3.5B Web Pages from CommonCraw 2012   
4    Indiana Webclicks  53.5B Web clicks of 100K users in Indiana Univ.   
..                 ...                                              ...   
56      ClueWeb12 FACC                                   ClueWeb12 FACC   
57       Google Ngrams                      Google Books Ngrams (2.2TB)   
58          EDRM Enron      EDRM Enron EMail of 151 users, hosted on S3   
59             GetGlue                  GetGlue - users rating TV shows   
60      Twitter RepLab    Twitter Data for Online Reputation Management   

                                                 link       categoryName  \
0             https://g

Unnamed: 0,datasetName,CompanyName,link,categoryName,cloud,vintage
0,Microbiome Project,American Gut (Microbiome Project),https://github.com/biocore/American-Gut,Biology,GitHub,
1,GloBI,Global Biotic Interactions (GloBI),https://github.com/jhpoelen/eol-globi-data/wik...,Biology,GitHub,
2,Global Climate,Global Climate Data Since 1929,http://en.tutiempo.net/climate,Climate/Weather,,1929.0
3,CommonCraw 2012,3.5B Web Pages from CommonCraw 2012,http://www.bigdatanews.com/profiles/blogs/big-...,Computer Networks,,2012.0
4,Indiana Webclicks,53.5B Web clicks of 100K users in Indiana Univ.,http://cnets.indiana.edu/groups/nan/webtraffic...,Computer Networks,,


## .isnull() flags the null values in each column; .sum() then totals those flags to give the number of nulls per column

In [33]:
# Work on an independent copy so that any later in-place change to `df_2`
# cannot leak back into `df_0` (plain assignment would only create a second
# name for the same object).
df_2 = df_0.copy()
# Number of missing values per column.
print(df_2.isnull().sum())

datasetName      0
CompanyName      0
link             0
categoryName     0
cloud           37
vintage         30
dtype: int64


## .dropna() method is used to drop rows which have null/NaN values in any of their columns

In [34]:
# Keep only the rows with no missing value in any column, then report the new size.
df_2 = df_2.dropna(axis=0, how="any")
df_2.shape

(2, 6)

In [35]:
# Preview the rows that remain (at most the first five).
df_2.head(n=5)

Unnamed: 0,datasetName,CompanyName,link,categoryName,cloud,vintage
32,FBI Hate Crime 2013,FBI Hate Crime 2013 - aggregated data,https://github.com/emorisse/FBI-Hate-Crime-Sta...,Social Sciences,GitHub,2013.0
41,NYC Uber,NYC Uber trip data April 2014 to September 2014,https://github.com/fivethirtyeight/uber-tlc-fo...,Transportation,GitHub,2014.0


In [36]:
# Confirm that no missing values remain in any column.
print(df_2.isna().sum())

datasetName     0
CompanyName     0
link            0
categoryName    0
cloud           0
vintage         0
dtype: int64
