# Practicing Quick Encoding

In [1]:
import pandas as pd 
    
df = pd.DataFrame({'vertebrates':[
'Bird',
'Bird',
'Mammal',
'Fish',
'Amphibian',
'Reptile',
'Mammal',
]})

In [2]:
df

Unnamed: 0,vertebrates
0,Bird
1,Bird
2,Mammal
3,Fish
4,Amphibian
5,Reptile
6,Mammal


In [3]:
df = pd.get_dummies(df, columns=['vertebrates'])

In [4]:
df

Unnamed: 0,vertebrates_Amphibian,vertebrates_Bird,vertebrates_Fish,vertebrates_Mammal,vertebrates_Reptile
0,0,1,0,0,0
1,0,1,0,0,0
2,0,0,0,1,0
3,0,0,1,0,0
4,1,0,0,0,0
5,0,0,0,0,1
6,0,0,0,1,0


In [5]:
df = pd.DataFrame({'vertebrates':[
'Bird',
'Bird',
'Mammal',
'Fish',
'Amphibian',
'Reptile',
'Mammal',
]})

In [6]:
df['vertebrates_codes'] = df.vertebrates.astype('category').cat.codes
df.sort_values(['vertebrates_codes'], ascending=False)

Unnamed: 0,vertebrates,vertebrates_codes
5,Reptile,4
2,Mammal,3
6,Mammal,3
3,Fish,2
0,Bird,1
1,Bird,1
4,Amphibian,0


# Practicing Bag of Words

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    "Authman ran faster than Harry because he is an athlete.",
    "Authman and Harry ran faster and faster."
]

bow = CountVectorizer()

X = bow.fit_transform(corpus)

print(X)
print(bow.get_feature_names())
print(X.toarray())

  (0, 2)	1
  (0, 0)	1
  (0, 8)	1
  (0, 7)	1
  (0, 4)	1
  (0, 6)	1
  (0, 10)	1
  (0, 5)	1
  (0, 9)	1
  (0, 3)	1
  (1, 1)	2
  (1, 6)	1
  (1, 5)	2
  (1, 9)	1
  (1, 3)	1
['an', 'and', 'athlete', 'authman', 'because', 'faster', 'harry', 'he', 'is', 'ran', 'than']
[[1 0 1 1 1 1 1 1 1 1 1]
 [0 2 0 1 0 2 1 0 0 1 0]]


# Practicing Folder Looping

In [8]:
import os
files = os.listdir()
files

['.git',
 '.gitattributes',
 '.ipynb_checkpoints',
 'contacts.txt',
 'graphic_cards.csv',
 'LCA_binary_search_tree.py',
 'Learning Material.md',
 'MST_Krustals_Variant.py',
 'my_tiers.xlsx',
 'python_tips.md',
 'Random Practice.ipynb',
 'README.md',
 'SLL_find_from_end.py',
 'Soup_Graphic_Cards.py']

# Unstructured Data Wrangling

In [9]:
import os
import chardet
os.listdir()[3]

'contacts.txt'

In [10]:
import pandas as pd

# dictionary to form the dataframe
connection_dict = {'contact_name': [],
                  'position': [],
                   'timespan': [],
                   'timespan_type': [],
                  'tier_status': [],
                  'company': [],
                  'last_date_contact': [],
                  'shared_interest': []}

# to add contacts not included in the unstructured contact list
adding_contacts = {'contact_name': ["Shantel Vargas",
                                    "Alex Steger",
                                    "Dorcas N Flowers",
                                    "Wilfredo Rodriguez",
                                    "Jennifer Rothenberg",
                                    "Gigi Yuen-Reed"],
                  'position': ["Master Student", 
                              "Equity Analyst",
                              "Special Education Teacher",
                              "City of Dallas",
                              "Marketing & Data Science",
                              "Data Science Solution for Healthcare"],
                   'timespan': ["2",
                               "3",
                               "33",
                               "6",
                               None,
                               None],
                   'timespan_type': ["years",
                                    "years",
                                    "years",
                                    "years",
                                    None,
                                    None],
                  'tier_status': ["tier1", 
                                 "tier2",
                                 "tier1",
                                 "tier2",
                                 "tier3",
                                 "tier3"],
                   'company': ["Unemployed",
                              "National Investment Services",
                              "Unemployed",
                              "City of Dallas",
                              "Unemployed",
                              "IBM"],
                   'last_date_contact': ["05/1/2016",
                                        "04/1/2016",
                                        "04/1/2018",
                                        "02/1/2018",
                                        None,
                                        None],
                   'shared_interest': ["Data Analysis",
                                      "Analysis",
                                      "Life",
                                      "Programing",
                                      "Data Analysis",
                                      "Data Analysis"]}

# dictionary to populate the tiers
tier_dict = {'tier1': 
             {'name':["Lauren O'Farrill", 
                    "John Carnevalla Jr",
                    "MARIA RAMOS ISIDOR, MAcc",
                    "Yaileen Garza",
                    "Lydia Montano"],
             'Company':["Ditech Holding Corporation",
                       "ConnectWise",
                       "PwC",
                       "HealthTrust Worforce Solutions",
                       "Florida Family Primary Care Centers"],
             'Email Address':["laurenofarrill@gmail.com",
                             "jcarnevalla21@yahoo.com",
                             "mramosis.mri@gmail.com",
                             "rodyaileen@outlook.com",
                             "lydia4lifeint822016@gmail.com"],
             'Last Date Contacted':["06/1/2017",
                                   "04/1/2018",
                                   "03/1/2018",
                                   "04/2/2018",
                                   "03/20/2018"],
             'Shared Interest':["Finance",
                               "Programing",
                               "Accounting",
                               "Billing",
                               "Programing"]},
            'tier2': 
             {'name':["David Lievano",
                     "Junior Sainval",
                     "T. Hudson White III",
                     "Nick Biengardo",
                     "Ashley Borde",
                     "Lexy Scarpiello",
                     "Rami Siab",
                     "Josani Schneider",
                     "Alyssa Mason",
                     "Brian Earnest",
                     "Maruja Azar Leanos",
                     "Mike Bowen",
                     "Nikki Stowell",
                     "Stephen Baricko",
                     "Pravishka Wickramasuriya",
                     "Ruben Madamba",
                     "Spencer Crawford",
                     "Única Channa",
                     "Dan Bell",
                     "Daniel LeBlanc",
                     "Carolyn Ebanks",
                     "Charles Mardook",
                     "Julian Brown"],
             'Company':["State Farm",
                       "PwC",
                       "PwC",
                       "Unemployed",
                       "Unemployed",
                       "Comcast",
                       "Planet Dodge Chrysler Jeep Ram",
                        "Caymen Islands Monetary Authority",
                        "The Depository Trust and Clearing Corporation",
                        "Unemployed",
                        "Mannaz Designs",
                        "University of South Florida",
                        "University of South Florida",
                        "Freedman's Office Furniture and Supplies",
                        "BB&T",
                        "Alere Home Monitoring",
                        "Machine Zone",
                        "Alere Home Monitoring",
                        "NerdHire",
                        "Unemployed",
                        "Children's Board of Hillsborough County",
                        "Self-Employed Consultant",
                        "Citi"
                       ],
             'Last Date Contacted':["12/1/2016",
                                   "10/1/2016",
                                   "12/20/2017",
                                   "12/1/2016",
                                   "12/1/2016",
                                   "10/1/2016",
                                   "12/1/2016",
                                   "12/1/2016",
                                   "12/1/2016",
                                   "10/1/2016",
                                   "12/1/2016",
                                   "05/1/2016",
                                   "10/1/2016",
                                   "06/1/2016",
                                   "03/1/2018",
                                   "02/1/2018",
                                   "12/1/2018",
                                   "01/20/2018",
                                   "02/1/2018",
                                   "10/1/2016",
                                   "05/1/2016",
                                   "12/1/2016",
                                   "12/1/2016"],
             'Shared Interest':["Finance", 
                               "Accounting",
                               "Finance",
                               "Business",
                               "Business",
                               "Finance",
                               "Finance",
                               "Finance",
                               "Finance",
                               "Entrepreneurship",
                               "Business",
                               "Finance",
                               "Business",
                               "Finance",
                               "Data Analysis",
                               "Video Games",
                               "Business",
                               "",
                               "Technology",
                               "Finance",
                               "Budget Analysis",
                               "Business",
                               "Finance"]},
            'tier3': 
             {'name':["Renee Murphy",
                     "Jamie Kelly",
                     "Maria Isabel Caicedo",
                     "Ralph Herz",
                     "Olga Leontyeva, MS, PMP, CSPO",
                     "Sireesha Pulipati",
                     "Madhuvanthi Kandadai",
                     "Fiona Huo",
                     "Perri Ma",
                     "Maria Kavaliova",
                     "Shaquille Powell, MBA",
                     "Chelsea Jone",
                     "Renee Manneh",
                     "Costa Stamatinos",
                     "Nazia Habib",
                     "Scott Provencher",
                     "Aya Masuo",
                     "Mihwa Han"],
             'Company':["Precision Health Technologies",
                       "Tesla",
                       "Unemployed",
                       "BB&T",
                       "Alere Home Monitoring",
                       "Genomic Health",
                       "Corium International",
                       "Unemployed",
                       "Warner Brothers",
                       "Oportun",
                       "Harnham",
                       "Bellator Recruiting Academy",
                       "Voyage",
                       "Unemployed"
                       "Houston GMAT",
                       "Unemployed",
                       "Unemployed",
                       "Unemployed",
                       "Unemployed"],
             'Last Date Contacted':[None,
                                   "03/1/2018",
                                   "03/1/2018",
                                   "12/1/2016",
                                   "12/1/2017",
                                   "02/1/2018",
                                   "04/3/2018",
                                   "02/1/2018",
                                   "03/1/2018",
                                   "02/1/2018",
                                   "07/1/2017",
                                   "03/1/2018",
                                   "03/1/2018",
                                   None,
                                   None,
                                   "02/1/2018",
                                   None,
                                   None],
             'Shared Interest':[None,
                               "Programing",
                               "Data Analysis",
                               "Finance",
                               "Business Analysis",
                               "Data Analysis",
                               "Statistics",
                               "Data Analysis",
                               "Data Analysis",
                               "Data Analysis",
                               "Military",
                               "Self-Driving Vehicles",
                               "Data Analysis",
                               "Data Analysis",
                               "Data Analysis",
                               "Data Analysis",
                               "Data Analysis",
                               "Data Analysis"]}}

In [11]:
total_contacts = len(tier_dict['tier2']['name']) + \
                    len(tier_dict['tier1']['name']) + \
                    len(tier_dict['tier3']['name']) + \
                    len(adding_contacts['contact_name'])
print("You have {} contacts.".format(total_contacts))

with open('contacts.txt', 'rb') as my_contacts:
    data_lines = my_contacts.read()
    
data_list = data_lines.decode('utf-8').replace(u'’', '').strip().split('\r\n')

for i, value in enumerate(data_list):
    #print(value)
    if value == "Members name":
        #print(data_list[i+1])
        connection_dict['contact_name'].append(data_list[i+1])
    elif value == "Members occupation":
        #print(data_list[i+1])
        connection_dict['position'].append(data_list[i+1])
    elif value.find('Connected') >= 0:
        #print(data_list[i])
        connection_split = data_list[i].split(' ')
        connection_dict['timespan'].append(connection_split[1])
        connection_dict['timespan_type'].append(connection_split[2])
        
    # setting tiers
    if value == "Members name":
        contact_name = data_list[i+1]
        if contact_name in tier_dict['tier1']['name']:
            
            # appending tier1
            connection_dict['tier_status'].append('tier1')
            
            # appending other info            
            index_target = tier_dict['tier1']['name'].index(contact_name)
            connection_dict['company'].append(tier_dict['tier1']['Company'][index_target])
            connection_dict['last_date_contact'].append(tier_dict['tier1']['Last Date Contacted'][index_target])
            connection_dict['shared_interest'].append(tier_dict['tier1']['Shared Interest'][index_target])
                                  
        elif contact_name in tier_dict['tier2']['name']:
            
            # appending tier2
            connection_dict['tier_status'].append('tier2')
            
            # appending other info            
            index_target = tier_dict['tier2']['name'].index(contact_name)
            connection_dict['company'].append(tier_dict['tier2']['Company'][index_target])
            connection_dict['last_date_contact'].append(tier_dict['tier2']['Last Date Contacted'][index_target])
            connection_dict['shared_interest'].append(tier_dict['tier2']['Shared Interest'][index_target])
            
        elif contact_name in tier_dict['tier3']['name']:
            
            # appending tier3
            connection_dict['tier_status'].append('tier3')
            
            # appending other info            
            index_target = tier_dict['tier3']['name'].index(contact_name)
            connection_dict['company'].append(tier_dict['tier3']['Company'][index_target])
            connection_dict['last_date_contact'].append(tier_dict['tier3']['Last Date Contacted'][index_target])
            connection_dict['shared_interest'].append(tier_dict['tier3']['Shared Interest'][index_target])
            
        else:
            connection_dict['tier_status'].append(None)
            connection_dict['company'].append(None)
            connection_dict['last_date_contact'].append(None)
            connection_dict['shared_interest'].append(None)

            
for key in connection_dict.keys():
    print(len(connection_dict[key]), key)
            
df1 = pd.DataFrame(adding_contacts)

df = pd.DataFrame(connection_dict)
df = df.append(df1)
df.reset_index(drop=True, inplace=True)

condition = (df.tier_status.isna() == False)
df_tier = df.loc[condition]

You have 52 contacts.
120 contact_name
120 position
120 timespan
120 timespan_type
120 tier_status
120 company
120 last_date_contact
120 shared_interest


In [12]:
df_tier.head()

Unnamed: 0,company,contact_name,last_date_contact,position,shared_interest,tier_status,timespan,timespan_type
0,Unemployed,Mihwa Han,,Data Scientist | Machine Learning Engineer | D...,Data Analysis,tier3,2,hours
7,Voyage,Renee Manneh,03/1/2018,"Operations, Self-Driving Vehicles",Data Analysis,tier3,5,days
13,Oportun,Maria Kavaliova,02/1/2018,"Principal Data Scientist, Senior Manager",Data Analysis,tier3,7,days
14,UnemployedHouston GMAT,Costa Stamatinos,,"Student at Udacity, Coursera, EdX",Data Analysis,tier3,7,days
17,Unemployed,Maria Isabel Caicedo,03/1/2018,Digital Marketing | Performance Management | B...,Data Analysis,tier3,1,week


In [13]:
df_tier.count()

company              51
contact_name         51
last_date_contact    44
position             51
shared_interest      50
tier_status          51
timespan             49
timespan_type        49
dtype: int64

In [14]:
df_tier.to_csv('my_tiers.csv')