In [1]:
import pandas as pd
import numpy as np

In [2]:
data_list = [[1,2,3], [4,5,6]]

In [7]:
df = pd.DataFrame(data_list, columns=['col1', 'col2', 'col3'], dtype=np.int8())
df

Unnamed: 0,col1,col2,col3
0,1,2,3
1,4,5,6


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   col1    2 non-null      int8 
 1   col2    2 non-null      int8 
 2   col3    2 non-null      int8 
dtypes: int8(3)
memory usage: 134.0 bytes


In [9]:
data_dict = {'col1': [1, 2], 'col2': [3, 4]}
df_1 = pd.DataFrame(data_dict)
df_1

Unnamed: 0,col1,col2
0,1,3
1,2,4


### Pandas wrangling
Ref: [Advanced pandas](https://www.tomasbeuzen.com/python-programming-for-data-science/practice-exercises/chapter9-wrangling-advanced-practice.html)

In [48]:
df = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-14/passwords.csv', usecols=['password', 'value', 'time_unit'])
df.head()

Unnamed: 0,password,value,time_unit
0,password,6.91,years
1,123456,18.52,minutes
2,12345678,1.29,days
3,1234,11.11,seconds
4,qwerty,3.72,days


In [49]:
df.isna().sum()

password     7
value        7
time_unit    7
dtype: int64

In [50]:
for col in list(df.columns):
    print(df[col].value_counts())

password
password    1
tester      1
scott       1
sydney      1
skippy      1
           ..
miller      1
lakers      1
spider      1
monster     1
passw0rd    1
Name: count, Length: 500, dtype: int64
value
3.72     233
3.19      87
6.91      56
3.43      39
7.92      31
18.52     18
11.11     11
1.29       5
2.56       5
3.70       4
92.27      4
3.09       3
1.85       2
1.84       1
17.28      1
Name: count, dtype: int64
time_unit
days       238
months      87
years       65
minutes     51
hours       43
seconds     11
weeks        5
Name: count, dtype: int64


In [149]:
#3
units = {
    "seconds": 1 / 3600,
    "minutes": 1 / 60,
    "days": 24,
    "weeks": 168,
    "months": 720,
    "years": 8760,
}

df['updated_time_unit'] = df['time_unit'].map(units)
df['updated_time_unit'] = df['updated_time_unit']*df['value']

In [150]:
df.dropna(axis=0, inplace=True)

In [151]:
df

Unnamed: 0,password,value,time_unit,updated_time_unit
0,password,6.91,years,60531.600000
1,123456,18.52,minutes,0.308667
2,12345678,1.29,days,30.960000
3,1234,11.11,seconds,0.003086
4,qwerty,3.72,days,89.280000
...,...,...,...,...
495,reddog,3.72,days,89.280000
496,alexande,6.91,years,60531.600000
497,college,3.19,months,2296.800000
498,jester,3.72,days,89.280000


In [152]:
#4
seq = df[df['password'].str.startswith('123')].reset_index(drop=True)
print(len(seq))

8


In [153]:
seq

Unnamed: 0,password,value,time_unit,updated_time_unit
0,123456,18.52,minutes,0.308667
1,12345678,1.29,days,30.96
2,1234,11.11,seconds,0.003086
3,12345,1.85,minutes,0.030833
4,123456789,1.84,weeks,309.12
5,123123,18.52,minutes,0.308667
6,123321,18.52,minutes,0.308667
7,123abc,3.7,weeks,621.6


In [154]:
# corrected
df[df['password'].str.contains(r"^123")]

Unnamed: 0,password,value,time_unit,updated_time_unit
1,123456,18.52,minutes,0.308667
2,12345678,1.29,days,30.96
3,1234,11.11,seconds,0.003086
5,12345,1.85,minutes,0.030833
48,123456789,1.84,weeks,309.12
57,123123,18.52,minutes,0.308667
413,123321,18.52,minutes,0.308667
462,123abc,3.7,weeks,621.6


In [158]:
#5
seq_avg = seq['updated_time_unit'].mean()
df_avg = df['updated_time_unit'].mean()
print('Seq 123 avg time: {:.2f} hours\nAvg time in dataset: {:.2f} hours'.format(seq_avg, df_avg))

Seq 123 avg time: 120.33 hours
Avg time in dataset: 15226.74 hours


In [159]:
df.head()

Unnamed: 0,password,value,time_unit,updated_time_unit
0,password,6.91,years,60531.6
1,123456,18.52,minutes,0.308667
2,12345678,1.29,days,30.96
3,1234,11.11,seconds,0.003086
4,qwerty,3.72,days,89.28


In [95]:
#6
import re
def pswd_not_num(col):
    x = re.search(r'^[A-z]+$', col)
    if x:
        return x
    else:
        return np.nan
df['password'] = df['password'].astype(str)
df_ = df['password'].apply(pswd_not_num)
df_ = df_.dropna()
len(df_)

407

In [161]:
#6
filtered_df_1 = df[df['password'].str.contains(r'^[^0-9]*$')].reset_index(drop=True)
len(filtered_df_1)

407

In [109]:
#7
filtered_df = df[df['password'].str.contains(r'.*[0-9].*')].reset_index(drop=True)
len(filtered_df)

50

In [120]:
#8
avg_diff = filtered_df_1['updated_time_unit'].mean() - filtered_df['updated_time_unit'].mean()
print("Avg difference in online cracking time between passwords that don't conatin a number vs \npasswords that contain atleast one number: {:.2f} hours".format(avg_diff))

Avg difference in online cracking time between passwords that don't conatin a number vs 
passwords that contain atleast one number: -223.05 hours


In [124]:
# 9
sub_str = "[.!?\\-]"
filtered_df_punct = df[df['password'].str.contains(r'[^A-z0-9]+')]
filtered_df_punct

Unnamed: 0,password,value,time_unit,updated_time_unit


In [163]:
#9 corrected
df[df['password'].str.contains(sub_str)]

Unnamed: 0,password,value,time_unit,updated_time_unit


In [141]:
shortest = df.loc[df['updated_time_unit'] == df['updated_time_unit'].min()].head(1)
shortest.iloc[0]

password                 1234
value                   11.11
time_unit             seconds
updated_time_unit    0.000278
Name: 3, dtype: object

In [142]:
#10
print('Shortest time: {} and passwored: {}'.format(shortest['updated_time_unit'], shortest['password']))

Shortest time: 3    0.000278
Name: updated_time_unit, dtype: float64 and passwored: 3    1234
Name: password, dtype: object


In [144]:
longest = df.loc[df['updated_time_unit'] == df['updated_time_unit'].max()].head(1)
longest.iloc[0]

password             password
value                    6.91
time_unit               years
updated_time_unit      8760.0
Name: 0, dtype: object

### Practice probs
Ref:
[Baby Names](https://www.practiceprobs.com/problemsets/python-pandas/series/baby-names/)

In [171]:
import numpy as np
import pandas as pd

babynames = pd.Series([
    'Jathonathon', 'Zeltron', 'Ruger', 'Phreddy', 'Ruger', 'Chad', 'Chad',
    'Ruger', 'Ryan', 'Ruger', 'Chad', 'Ryan', 'Phreddy', 'Phreddy', 'Phreddy',
    'Mister', 'Zeltron', 'Ryan', 'Ruger', 'Ruger', 'Jathonathon',
    'Jathonathon', 'Ruger', 'Chad', 'Zeltron'], dtype='string')
babynames.value_counts()

Ruger          7
Phreddy        4
Chad           4
Jathonathon    3
Zeltron        3
Ryan           3
Mister         1
Name: count, dtype: Int64

### Pandas series problem
[Bees Knees Problem](https://www.practiceprobs.com/problemsets/python-pandas/series/bees-knees/)

In [208]:
bees = pd.Series([True, True, False, np.nan, True, False, True, np.nan])
knees = pd.Series([5,2,9,1,3,10,5,2], index = [7,0,2,6,3,5,1,4])

In [209]:
bees, knees

(0     True
 1     True
 2    False
 3      NaN
 4     True
 5    False
 6     True
 7      NaN
 dtype: object,
 7     5
 0     2
 2     9
 6     1
 3     3
 5    10
 1     5
 4     2
 dtype: int64)

In [213]:
knees.loc[pd.isna(bees).to_numpy()] *= 2
knees

7     5
0     2
2     9
6     4
3     3
5    10
1     5
4     8
dtype: int64