In [None]:
%config InlineBackend.figure_format = 'svg'

In [None]:
!which python

In [None]:
!python --version

In [None]:
!pip install pandas seaborn

## 2.2 A quick tour of Python data analysis capabilities

In [None]:
import pandas as pd
# Download the chapter's satisfaction data and treat Segment as a
# categorical variable rather than a number.
sat_df = pd.read_csv('https://bit.ly/PMR-ch2')
sat_df['Segment'] = sat_df['Segment'].astype('category')
sat_df.head()

In [None]:
sat_df.describe()

In [None]:
import seaborn as sns
# Correlate the numeric columns only: the categorical Segment column is
# excluded explicitly because pandas >= 2.0 no longer drops non-numeric
# columns from corr() automatically.
sns.heatmap(sat_df.select_dtypes(include='number').corr())

In [None]:
sat_df.groupby('Segment').iProdSAT.mean()

In [None]:
import statsmodels.formula.api as smf
from statsmodels.stats import anova as sms_anova
# The `-1` term drops the intercept, so the model estimates one
# coefficient per Segment level instead of offsets from a baseline.
segment_psat_lm = smf.ols('iProdSAT ~ -1 + Segment',
                          data=sat_df).fit()
# Summarize the fitted model as an ANOVA table.
sms_anova.anova_lm(segment_psat_lm)

In [None]:
import matplotlib.pyplot as plt
# Plot each segment's estimated coefficient with a horizontal error bar.
segment_means = segment_psat_lm.params
# Column 1 of conf_int() holds the upper bounds; their distance from the
# estimates is passed as the error-bar length.
upper_bound = segment_psat_lm.conf_int()[1]
plt.errorbar(x=segment_means.values,
             y=segment_means.index,
             xerr=upper_bound - segment_means,
             fmt='ko')

## Basics of working with Python commands

In [None]:
x = [1, 23, 6]
print(x)

In [None]:
# Python names are case-sensitive: only `x` was defined, so looking up
# `X` fails. The error is caught and shown so that "Run All" keeps going.
try:
  print(X)
except NameError as err:
  print('NameError: {}'.format(err))

In [None]:
x = [1, 23, 6] # Initialize a list

In [None]:
# Initialize a list
x = [1, 23, 6]
print(x)

## 2.4 Built-in types

### 2.4.1 Booleans

In [None]:
1 == 1

In [None]:
1 < 2

In [None]:
1 == 2

In [None]:
x = True
y = False
x or y

In [None]:
x and y

In [None]:
x and not y

### 2.4.2 Numeric types

int

In [None]:
x = 2
y = 4
x + y

In [None]:
w = x/y
w

In [None]:
type(w)

In [None]:
x ** y

float

In [None]:
z = 3.2
type(x * z)

complex

In [None]:
3j

In [None]:
3j**2

### 2.4.3 Sequence types

list

In [None]:
x = [0, 1, 2, 3, 4, 5]
y = ['a', 'b', 'c']

In [None]:
x + y

In [None]:
x.append('r')
x

In [None]:
len(x)

In [None]:
x[1]

In [None]:
x[2:4]

In [None]:
x[:2]

In [None]:
x[1:]

In [None]:
x[-2:]

In [None]:
x[2] = 'freeze'
x

tuple

In [None]:
z = (7, 8, 9)

In [None]:
z[1]

In [None]:
# Tuples are immutable, so item assignment raises TypeError. The error is
# caught and shown so that "Run All" keeps going.
try:
  z[1] = 'boil'
except TypeError as err:
  print('TypeError: {}'.format(err))

range

In [None]:
range(10)

In [None]:
list(range(10))

In [None]:
list(range(2,12))

In [None]:
list(range(2, 12, 2))

### 2.4.4 Text sequence type - str

In [None]:
x = 'Hello'
y = "World"
x+y

In [None]:
x[3:]

In [None]:
x.lower()

In [None]:
x.upper()

In [None]:
x.replace('lo', 'p')

In [None]:
', '.join([x,y, 'what a day!'])

In [None]:
'Hello, World, what a day!'.split(',')

#### Format

In [None]:
temperature = 21.34
'The temperature today is {} degrees'.format(temperature)

In [None]:
x = 18.93
y = 345.234
'{} divided by {} equals {}'.format(x, y, x/y)

In [None]:
'{x} plus {x} plus {y} equals {r}'.format(x=x, y=y, r=x + x + y)

### 2.4.5 Set types

In [None]:
x = [1, 1, 3, 8, 12, 12]
set(x)

In [None]:
3 in x

In [None]:
5 in x

In [None]:
x = set([1, 1, 3, 8, 12, 12])
y = set([2, 2, 8, 9])
x.intersection(y)

In [None]:
y.difference(x)

In [None]:
x.union(y)

### 2.4.6 Mapping types

In [None]:
x = dict(a=1, b=2, c=3)
x

In [None]:
x = {'a': 1, 'b': 2, 'c': 3}
x

In [None]:
y = {1: 'a', 2: 'b', 3: 'c'}
y

In [None]:
x['a']

In [None]:
y.items()

In [None]:
y.keys()

In [None]:
y.values()

### 2.4.7 Functions, classes, and methods

In [None]:
def add(a, b):
  '''Return the result of a + b.'''
  total = a + b
  return total

In [None]:
add(3, 4)

In [None]:
add(a=3, b=4)

In [None]:
def add(a, b=0):
  '''Return the result of a + b; b defaults to 0 when omitted.'''
  total = a + b
  return total

In [None]:
add(3)

In [None]:
add(3, 4)

In [None]:
class Adder:
  '''A class that adds its parameters'''

  def __init__(self, a, b):
    '''Store the two operands as the attributes x and y.'''
    self.x, self.y = a, b

  def add(self):
    '''Return x + y for the stored operands.'''
    return self.x + self.y

In [None]:

adder = Adder(3, 4)
adder

In [None]:
adder.x

In [None]:
adder.add()

In [None]:
adder2 = Adder('frog', 'coyote')
adder2.add()

### 2.4.8 Modules and packages

In [None]:
!pip install python_marketing_research

In [None]:
from python_marketing_research_functions import chapter2

chapter2.add(3, 4)

In [None]:
adder = chapter2.Adder(3, 4)
adder.add()

In [None]:
!wget https://raw.githubusercontent.com/python-marketing-research/\
  python-marketing-research-1ed/master/requirements.txt
!pip install -r requirements.txt

### 2.4.9 Control flow

#### If statement

In [None]:
x = 5
if x > 2:
  print('x = {}, which is greater than 2'.format(x))
print('Done!')

In [None]:
x = 0
if x > 2:
  print('x = {}, which is greater than 2'.format(x))
print('Done!')

In [None]:
x = 0
if x > 2:
  print('x = {}, which is greater than 2'.format(x))
else:
  print('x = {}, which is less than or equal to 2'.format(x))

In [None]:
x = 2
if x > 2:
  print('x = {}, which is greater than 2'.format(x))
elif x == 2:
  print('x = {}, which equals 2!'.format(x))
else:
  print('x = {}, which is less than 2'.format(x))

In [None]:
def check_present(value, values):
  '''Print a message saying whether value is contained in values.'''
  if value not in values:
    print('{} was NOT found in the values'.format(value))
    return
  print('{} was found in the values'.format(value))

In [None]:
a = set([4, 2, 5, 1, 12, 33])
check_present(38, a)
check_present(12, a)

#### While statement

In [None]:
x = 0
while x < 5:
  print(x)
  x += 1

In [None]:
a = [4, 2, 5, 1, 12, 33]
a_squared = []
# Walk the list by position; the loop ends once i reaches len(a).
i = 0
while i < len(a):
  a_squared.append(a[i]**2)
  i += 1  # advance manually, otherwise the condition never changes
print('a_squared generated: {}'.format(a_squared))

#### For statements

In [None]:
a = [4, 2, 5, 1, 12, 33]
a_squared = []
# Iterate over the elements directly; no index bookkeeping is needed.
for x in a:
  a_squared.append(x**2)
print('a_squared generated: {}'.format(a_squared))

In [None]:
for i in range(5):
  print(i)

In [None]:
a_squared = []
for i in range(len(a)):
  a_squared.append(a[i]**2)
print('a_squared generated: {}'.format(a_squared))

In [None]:
for i in range(21, 100, 12):
  print(i)

In [None]:
for x, y in zip(range(6), range(6, 18, 2)):
  print(x,y)

In [None]:
# zip() stops at the shorter input: range(6, 12, 2) yields only three
# values, so three pairs are printed.
for j, k in zip(range(6), range(6, 12, 2)):
  print(j, k)

In [None]:
for i, x in enumerate(a):
  print(i, x)

In [None]:
# Search downward from x - 1 for the first value that divides x evenly.
x = 34
y = x-1
while True:
  if x % y == 0:
    break  # y divides x; leave the otherwise endless loop
  y -= 1
# x/y is true division, so the co-factor is formatted as a float.
print('{y} is the largest factor of {x},\n{f2} times {y} equals {x}'
      .format(y=y, x=x, f2=x/y))

#### List comprehension

In [None]:
a = [4, 2, 5, 1, 12, 33]

In [None]:
a_plus_one = [x + 1 for x in a]
a_plus_one

In [None]:
a_plus_one_filtered = [x + 1 for x in a if x < 12]
a_plus_one_filtered

In [None]:
a_modified = [x + 1 if x < 12 else x * 100 for x in a]
a_modified

In [None]:
a_square_tuples = [(v, v**2) for v in a]
a_square_tuples

In [None]:
a_reconstructed = [w/v for v,w in a_square_tuples]
a_reconstructed

In [None]:
a_square_dict = {v: v**2 for v in a}
a_square_dict

### Help!

In [None]:
help(range)

In [None]:
help(Adder)

In [None]:
?range

### 2.5.1 NumPy

#### Array

In [None]:
import numpy as np

In [None]:
x = np.array([1, 3, 4])
print(x)
print(x.dtype)

In [None]:
x = np.array([1, 3, 4, 'a', 'b'])
print(x)
print(x.dtype)

In [None]:
a = [7, 4, 2, 22, -12]
a[:3]

In [None]:
x = np.array(a)
x[:3]

In [None]:
x[[0, 3]]

In [None]:
# Unlike a NumPy array, a plain list cannot be indexed with a list of
# positions. The TypeError is caught and shown instead of leaving the
# statement commented out.
try:
  a[[0, 3]]
except TypeError as err:
  print('TypeError: {}'.format(err))

In [None]:
x[[True, False, False, True, False]]

In [None]:
A = [[1, 2, 3],
    [4, 5, 6]]
A

In [None]:
A[0]

In [None]:
A[1][2]

In [None]:
aA = np.array(A)
aA[0, :]

In [None]:
aA[:, 1]

In [None]:
aA[1, 2]

In [None]:
x = np.empty(shape=5, dtype=np.int32)
x

In [None]:
x = np.zeros(shape=5)
x

#### Vectorized operations

In [None]:
a = [7, 4, 2, 22, -12]
a_squared = []
for v in a:
  a_squared.append(v**2)
print(a_squared)

In [None]:
[v**2 for v in a]

In [None]:
x = np.array(a)
x**2

In [None]:
x > 5

In [None]:
(x > 5).sum()

In [None]:
(x > 5).sum()/len(x)

In [None]:
x[x > 5]

In [None]:
y = np.array([34, 2, 9, -5, -18])
x / y

### Pandas

In [None]:
import pandas as pd

#### Series

In [None]:
a = [7, 4, 2, 22, -12]
x = pd.Series(a)
x

In [None]:
x.index

In [None]:
x.values

In [None]:
x = pd.Series(a, index=['a', 'b', 'c', 'd', 'e'])
x['a']

In [None]:
x[['b', 'd', 'e']]

In [None]:
# pd.datetime was removed in pandas 2.0; pd.to_datetime parses the same
# string/format pair and returns a Timestamp that pd.date_range accepts.
start_time = pd.to_datetime('2019-04-09', format='%Y-%m-%d')
x = pd.Series(a,
              index=pd.date_range(start=start_time,
                                  normalize=True, periods=5)
             )
x

In [None]:
x + 5

In [None]:
x**2

In [None]:
x.abs() > 10

#### Data Frames

In [None]:
a = [7, 4, 2, 22, -12]
b = [34, 2, 9, -5, -18]
ab_df = pd.DataFrame({'a': a, 'b': b})
ab_df

In [None]:
ab_df = pd.DataFrame({'a': a, 'b': b},
                     index=pd.date_range(start=start_time,
                                  normalize=True, periods=5))
ab_df

In [None]:
ab_df['a']

In [None]:
ab_df.a

In [None]:
ab_df.loc['2019-04-10']

In [None]:
ab_df.iloc[3]

In [None]:
ab_df.iloc[:, 1]

In [None]:
ab_df.head(3)

In [None]:
ab_df.mean()

In [None]:
ab_df.mean(axis=0)

In [None]:
ab_df.mean(axis=1)

In [None]:
# One row per store: categorical id, revenue, visit count and manager name.
store_id = pd.Series([3, 14, 21, 32, 54]).astype('category')  # store id
store_rev = [543, 654, 345, 678, 234]  # store revenue, $1000
store_visits = [45, 78, 32, 56, 34]  # visits, 1000s
store_manager = ['Annie', 'Bert', 'Carla', 'Dave', 'Ella']
store_df = pd.DataFrame(dict(id=store_id,
                             rev=store_rev,
                             visits=store_visits,
                             manager=store_manager))
store_df

In [None]:
store_df.manager

In [None]:
store_df.rev.mean()

In [None]:
# Correlate the numeric columns only: pandas >= 2.0 raises on the string
# `manager` column instead of silently dropping it.
store_df.select_dtypes(include='number').corr()

In [None]:
store_df.describe()

### Missing data

In [None]:
np.nan

In [None]:
type(np.nan)

In [None]:
x = np.array([3., 4., 6., 2., np.nan, 18., np.nan])
x

In [None]:
np.mean(x)

In [None]:
np.nanmean(x)

In [None]:
np.isnan(x)

In [None]:
~np.isnan(x)

In [None]:
np.mean(x[~np.isnan(x)])

In [None]:
x = np.array([3., 4., 6., 2., -999, 18., -999])
x.mean()

In [None]:
# Replace the -999 sentinel with NaN so nanmean() skips those entries.
x[x == -999] = np.nan
np.nanmean(x)

In [None]:
np.log(np.array([-1, 0, 1]))

## Loading and saving data

### Pickle files

In [None]:
import pickle

# Manual open/close: the handle is only released if dump() succeeds,
# because close() is not reached when an exception is raised first.
f = open('store_df.pkl', 'wb')
pickle.dump(store_df, f)
f.close()

In [None]:
help(open)

In [None]:
with open('store_df.pkl', 'wb') as f:
  pickle.dump(store_df, f)

In [None]:
# Read the pickled object back from disk.
# NOTE: unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
with open('store_df.pkl', 'rb') as f:
  store_df_reload = pickle.load(f)
store_df_reload

In [None]:
store_df.to_pickle('store_df.pkl')

In [None]:
store_df_reload2 = pd.read_pickle('store_df.pkl')
store_df_reload2

### Importing data

In [None]:
store_df.to_csv()

In [None]:
store_df.to_csv('store_df.csv', index=False)

In [None]:
# Read the CSV file back in, then cast `id` to a categorical dtype
# before displaying the frame.
store_df_from_csv = pd.read_csv('store_df.csv')
store_df_from_csv.id = store_df_from_csv.id.astype('category')
store_df_from_csv

In [None]:
store_df == store_df_from_csv

### Importing data in Colab

In [None]:
from google.colab import files

files.download('store_df.csv')

In [None]:
files.upload()

In [None]:
!ls

In [None]:
# NOTE(review): the file name presumably matches what the upload step
# saved (see the `!ls` listing) — confirm before running.
store_df_uploaded = pd.read_csv('store_df (1).csv')

In [None]:
store_df_uploaded == store_df

In [None]:
del store_df_uploaded, store_df