In [176]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
import requests

In [177]:
%matplotlib inline

In [178]:
#assign url variable where we will pull data from
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/internet_ads/ad.data"

In [179]:
#pull down the csv info into a pandas dataframe
ad_df = pd.read_csv(url, header=None, dtype=None)

  interactivity=interactivity, compiler=compiler, result=result)


In [180]:
ad_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1549,1550,1551,1552,1553,1554,1555,1556,1557,1558
0,125,125,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
1,57,468,8.2105,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
2,33,230,6.9696,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
3,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
4,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.


In [205]:
ad_df.shape

(3279, 1559)

In [181]:
#pull in the names.csv info into a pandas dataframe
ad_names = pd.read_csv('~/Downloads/column_names.csv', header=None)

In [183]:
ad_names.head()

Unnamed: 0,0
0,height: continuous.
1,width: continuous.
2,aratio: continuous.
3,"local: 0,1."
4,"url*images+buttons: 0,1."


In [185]:
#add a column name
column_name = ["attributes"]

In [186]:
#assign the column name
ad_names.columns = column_name

In [187]:
#clean up the values in the column
def clean_attr(row):
    return str(row.replace("*","_").replace(".","").replace("0","").replace("1","").replace(",","").replace(":","").strip())
ad_names['attributes'] = ad_names['attributes'].apply(clean_attr)

In [189]:
ad_names.head()

Unnamed: 0,attributes
0,height continuous
1,width continuous
2,aratio continuous
3,local
4,url_images+buttons


In [190]:
#create a column name list
ad_columns = ad_names["attributes"].tolist()

In [None]:
#add column names to ad dataframe
ad_df.columns = ad_columns

In [204]:
ad_df.shape

(3279, 1559)

In [192]:
#expanded data view beyond .head()
ad_df[:25]

Unnamed: 0,height continuous,width continuous,aratio continuous,local,url_images+buttons,url_likesbookscom,url_wwwslakecom,url_hydrogeologist,url_oso,url_media,...,caption_home,caption_my,caption_your,caption_in,caption_bytes,caption_here,caption_click,caption_for,caption_you,ad nonad
0,125,125,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
1,57,468,8.2105,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
2,33,230,6.9696,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
3,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
4,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
5,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
6,59,460,7.7966,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
7,60,234,3.9,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
8,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
9,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.


In [193]:
#check data types
ad_df.dtypes

height continuous            object
width continuous             object
aratio continuous            object
local                        object
url_images+buttons            int64
url_likesbookscom             int64
url_wwwslakecom               int64
url_hydrogeologist            int64
url_oso                       int64
url_media                     int64
url_peace+images              int64
url_blipverts                 int64
url_tkaine+kats               int64
url_labyrinth                 int64
url_advertising+blipverts     int64
url_images+oso                int64
url_area5+corridor            int64
url_ran+gifs                  int64
url_express-scriptscom        int64
url_off                       int64
url_cnet                      int64
url_time+998                  int64
url_josefina3                 int64
url_truluckcom                int64
url_clawnext+gif              int64
url_autopencom                int64
url_tvgencom                  int64
url_pixs                    

In [194]:
#count data types
ad_df.dtypes.value_counts()

int64     1554
object       5
dtype: int64

In [195]:
#check why "local" is an object data type as it does not contain any "?" in the expanded view above
print(ad_df.loc[:,"local"].unique()) 

['1' '0' '?' 1 0]


In [196]:
# Coerce to numeric and impute medians for height continuous column
ad_df.loc[:, "height continuous"] = pd.to_numeric(ad_df.loc[:, "height continuous"], errors='coerce')
HasNan = np.isnan(ad_df.loc[:,"height continuous"])
ad_df.loc[HasNan, "height continuous"] = np.nanmedian(ad_df.loc[:,"height continuous"])

In [197]:
# Coerce to numeric and impute medians for width continuous column
ad_df.loc[:, "width continuous"] = pd.to_numeric(ad_df.loc[:, "width continuous"], errors='coerce')
HasNan = np.isnan(ad_df.loc[:,"width continuous"])
ad_df.loc[HasNan, "width continuous"] = np.nanmedian(ad_df.loc[:,"width continuous"])

In [198]:
# Coerce to numeric and impute medians for aratio continuous column
ad_df.loc[:, "aratio continuous"] = pd.to_numeric(ad_df.loc[:, "aratio continuous"], errors='coerce')
HasNan = np.isnan(ad_df.loc[:,"aratio continuous"])
ad_df.loc[HasNan, "aratio continuous"] = np.nanmedian(ad_df.loc[:,"aratio continuous"])

In [199]:
# Coerce to numeric and impute medians for local column
ad_df.loc[:, "local"] = pd.to_numeric(ad_df.loc[:, "local"], errors='coerce')
HasNan = np.isnan(ad_df.loc[:,"local"])
ad_df.loc[HasNan, "local"] = np.nanmedian(ad_df.loc[:,"local"])

In [200]:
#check data type counts
ad_df.dtypes.value_counts()

int64      1554
float64       4
object        1
dtype: int64

In [202]:
#remove . from "ad nonad" column
ad_df["ad nonad"] = ad_df["ad nonad"].map(lambda x: str(x)[:-1])

In [203]:
ad_df.head()

Unnamed: 0,height continuous,width continuous,aratio continuous,local,url_images+buttons,url_likesbookscom,url_wwwslakecom,url_hydrogeologist,url_oso,url_media,...,caption_home,caption_my,caption_your,caption_in,caption_bytes,caption_here,caption_click,caption_for,caption_you,ad nonad
0,125.0,125.0,1.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad
1,57.0,468.0,8.2105,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad
2,33.0,230.0,6.9696,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad
3,60.0,468.0,7.8,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad
4,60.0,468.0,7.8,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad
