In [1]:
import collections
import math
import pandas as pd
# Report the pandas version in use. The parenthesised single-argument form
# is valid under both Python 2 and Python 3, unlike the bare print statement.
print('Pandas Version ' + pd.__version__)

Pandas Version 0.20.2


In [2]:
# Functions that we are going to use to build in extra features
def getLength(string):
    """Return the length of ``string`` as reported by the built-in ``len``."""
    n_chars = len(string)
    return n_chars

def getEntropy(string):
    """Return the Shannon entropy of ``string`` in bits per symbol.

    Each distinct symbol's probability is its count divided by the total
    length; the result is ``-sum(p * log2(p))`` over those probabilities.

    The result is always a non-negative float: an empty string yields ``0.0``
    (rather than the int ``0``), and a string made of a single repeated
    symbol yields ``0.0`` (rather than ``-0.0``).
    """
    if not string:
        # Nothing to count; define the entropy of an empty input as zero.
        return 0.0

    total = float(len(string))  # float so the division below is true division on Python 2
    bits = 0.0
    for count in collections.Counter(string).values():
        p = count / total
        # p * log2(p) is <= 0, so subtracting accumulates a non-negative sum
        # and never produces a negative zero.
        bits -= p * math.log(p, 2)
    return bits

In [3]:
# Load the raw CSV into a dataframe. header=None means the first line is
# treated as data, and the two columns are named 'rank' and 'uri'.
df = pd.read_csv(
    'simple_domains.csv',
    header=None,
    names=['rank', 'uri'],
    encoding='utf-8'
)
# Preview the first ten rows
df.head(n=10)

Unnamed: 0,rank,uri
0,1,www.facebook.com.
1,2,www.ibm.com.
2,3,www.twitter.com.
3,4,www.amazon.co.uk.
4,5,www.microsoft.com.
5,6,www.yahoo.com
6,7,www.x100-19735784.facebook.xw993je8h78ruhe8fuh...
7,8,www.boi.ie.
8,9,www.itb.ie.
9,10,www.bbc.co.uk.


In [9]:
# Describe the high-level stats of the derived features.
# The features are computed here directly from the 'uri' column instead of
# being read from df: on a top-to-bottom run the 'length' and 'entropy'
# columns are only added to df by a later cell, so relying on them here would
# make this cell's result depend on execution order.
pd.DataFrame(
    {'length': df['uri'].map(getLength), 'entropy': df['uri'].map(getEntropy)},
    columns=['length', 'entropy']
).describe()

Unnamed: 0,length,entropy
count,10.0,10.0
mean,18.0,2.950747
std,11.87902,0.561434
min,11.0,2.413088
25%,12.25,2.614387
50%,15.0,2.860356
75%,17.0,3.123509
max,51.0,4.336367


In [4]:
# Derive extra feature columns from the 'uri' column.

# Assigning to df['<name>'] creates the column; the Series on the right-hand
# side supplies the calculated value for each row.

# Length of each URI
df['length'] = df['uri'].map(getLength)

# Entropy of each URI
df['entropy'] = df['uri'].map(getEntropy)

In [5]:
# Preview the first ten rows, now including the new feature columns
df.head(n=10)

Unnamed: 0,rank,uri,length,entropy
0,1,www.facebook.com.,17,3.130718
1,2,www.ibm.com.,12,2.625815
2,3,www.twitter.com.,16,2.905639
3,4,www.amazon.co.uk.,17,3.101881
4,5,www.microsoft.com.,18,3.155222
5,6,www.yahoo.com,13,2.815072
6,7,www.x100-19735784.facebook.xw993je8h78ruhe8fuh...,51,4.336367
7,8,www.boi.ie.,11,2.413088
8,9,www.itb.ie.,11,2.413088
9,10,www.bbc.co.uk.,14,2.610577


In [6]:
# Delete the rank column.
# drop(..., errors='ignore') returns a new frame and does nothing when 'rank'
# is already gone, so this cell can be re-run safely; `del df['rank']` raises
# a KeyError on its second execution.
df = df.drop('rank', axis=1, errors='ignore')

In [7]:
# Preview the first ten rows now that the rank column has been removed
df.head(n=10)

Unnamed: 0,uri,length,entropy
0,www.facebook.com.,17,3.130718
1,www.ibm.com.,12,2.625815
2,www.twitter.com.,16,2.905639
3,www.amazon.co.uk.,17,3.101881
4,www.microsoft.com.,18,3.155222
5,www.yahoo.com,13,2.815072
6,www.x100-19735784.facebook.xw993je8h78ruhe8fuh...,51,4.336367
7,www.boi.ie.,11,2.413088
8,www.itb.ie.,11,2.413088
9,www.bbc.co.uk.,14,2.610577


In [18]:
# There are 10 rows of data in the dataset, so let's take a sample of 5 rows
# without replacement. random_state pins the random draw so the same rows are
# selected every time the cell is run.
sample = df.sample(n=5, replace=False, random_state=42)
# Print the head of the sample
sample.head()

Unnamed: 0,uri,length,entropy
8,www.itb.ie.,11,2.413088
7,www.boi.ie.,11,2.413088
0,www.facebook.com.,17,3.130718
4,www.microsoft.com.,18,3.155222
2,www.twitter.com.,16,2.905639


In [19]:
# There are 10 rows of data in the dataset, so let's take 50% of the rows
# without replacement. random_state pins the random draw so the same rows are
# selected every time the cell is run.
sample = df.sample(frac=0.5, replace=False, random_state=42)
# Print the head of the sample
sample.head()

Unnamed: 0,uri,length,entropy
1,www.ibm.com.,12,2.625815
0,www.facebook.com.,17,3.130718
2,www.twitter.com.,16,2.905639
7,www.boi.ie.,11,2.413088
5,www.yahoo.com,13,2.815072


In [45]:
# Let's take a sample of rows where the domain name contains 'facebook'.
# NOTE: there are only two rows that contain 'facebook', so replace=False
# will not work if we ask for more than 2 samples.
# We could set replace=True and we would then get duplicates.
#
# regex=False matches 'facebook' as a literal substring, na=False treats a
# missing URI as "no match" so the mask stays purely boolean, and
# random_state makes the draw repeatable.
sample = df[df.uri.str.contains('facebook', regex=False, na=False)].sample(
    n=2, replace=False, random_state=42
)
sample.head()

# Get 5 instead of two
# sample = df[df.uri.str.contains('facebook', regex=False, na=False)].sample(
#     n=5, replace=True, random_state=42
# )
# sample.head()

Unnamed: 0,uri,length,entropy
0,www.facebook.com.,17,3.130718
6,www.x100-19735784.facebook.xw993je8h78ruhe8fuh...,51,4.336367
0,www.facebook.com.,17,3.130718
0,www.facebook.com.,17,3.130718
0,www.facebook.com.,17,3.130718


In [8]:
# Save the dataframe to a new CSV file, without a header row or index column.
# The input was read as UTF-8, so write it back as UTF-8 explicitly: on
# Python 2, to_csv otherwise defaults to ASCII and fails on non-ASCII URIs.
df.to_csv('processed_domains.csv', header=False, index=False, encoding='utf-8')

In [None]:
# Save the sampled data to a new CSV file, without a header row or index column.
# Write `sample` (the sampled rows), not the full `df`, and use UTF-8
# explicitly to match the encoding the input was read with.
sample.to_csv('sampled_domains.csv', header=False, index=False, encoding='utf-8')