In [7]:
import requests
from bs4 import BeautifulSoup
import sys
from os import path
import csv
import pandas as pd
from sklearn.pipeline import make_pipeline
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import seaborn as sns
import re
import matplotlib.pyplot as plt
from nltk.stem import RSLPStemmer
import text_unidecode as unidecode

Load DF

In [3]:
interviews_df = pd.read_csv('interview_data.csv', sep=',', encoding='cp1252')

In [4]:
interviews_df.head()

Unnamed: 0,Company,Interview Questions,Date,Candidate,Offer?,Experience,Difficulty of Interview,How the candidate applied,Process,Who found review helpful
0,Google,Why you ? What brough tyou here ?,30-May-23,Anonymous Interview Candidate in Zürich,Declined Offer,Neutral Experience,Difficult Interview,I interviewed at Google (Zürich),It wa s avery smooth interviw . I really liked...,Be the first to find this interview helpful\nH...
1,Google,How would you optimize a database query? How w...,18-May-23,Anonymous Interview Candidate,No Offer,Positive Experience,Average Interview,I interviewed at Google,They asked me about my technical skills and st...,2 people found this interview helpful\nHelpful...
2,Google,predict some metrics using regression,4-May-23,Anonymous Interview Candidate in Mountain View...,No Offer,Positive Experience,Difficult Interview,"I interviewed at Google (Mountain View, CA)",brainstorm some statistics question with the i...,1 person found this interview helpful\nHelpful...
3,Google,How would you forecast a brands sales.,2-May-23,"Anonymous Interview Candidate in London, England",No Offer,Neutral Experience,Difficult Interview,"I interviewed at Google (London, England)",Few relatively simple technical questions. Ver...,1 person found this interview helpful\nHelpful...
4,Google,I signed an NDA cannot disclose those,27-Mar-23,Anonymous Employee,Accepted Offer,Positive Experience,Difficult Interview,I interviewed at Google,There is s a tech screen and then an onsite. Y...,1 person found this interview helpful\nHelpful...


Clean text in DF

In [15]:
nltk.download('rslp')

[nltk_data] Downloading package rslp to /home/conls91/nltk_data...
[nltk_data]   Unzipping stemmers/rslp.zip.


True

In [45]:
def cleaning(sentence):
    # Lower case all text
    sentence = sentence.lower()
    
    # Remove all numbers
    sentence = ''.join(char for char in sentence if not char.isdigit())
    
    # Remove all punctuation
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '')
    
    # Tokenize 
    tokenized_sentence = word_tokenize(sentence)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokenized_sentence = [w for w in tokenized_sentence if not w in stop_words]
    
    # Join tokenized sentence back together
    cleaned_sentence = ' '.join(tokenized_sentence)
    
    return cleaned_sentence

In [46]:
interviews_df_cleaned = interviews_df.copy()
interviews_df_cleaned.head()

Unnamed: 0,Company,Interview Questions,Date,Candidate,Offer?,Experience,Difficulty of Interview,How the candidate applied,Process,Who found review helpful
0,Google,Why you ? What brough tyou here ?,30-May-23,Anonymous Interview Candidate in Zürich,Declined Offer,Neutral Experience,Difficult Interview,I interviewed at Google (Zürich),It wa s avery smooth interviw . I really liked...,Be the first to find this interview helpful\nH...
1,Google,How would you optimize a database query? How w...,18-May-23,Anonymous Interview Candidate,No Offer,Positive Experience,Average Interview,I interviewed at Google,They asked me about my technical skills and st...,2 people found this interview helpful\nHelpful...
2,Google,predict some metrics using regression,4-May-23,Anonymous Interview Candidate in Mountain View...,No Offer,Positive Experience,Difficult Interview,"I interviewed at Google (Mountain View, CA)",brainstorm some statistics question with the i...,1 person found this interview helpful\nHelpful...
3,Google,How would you forecast a brands sales.,2-May-23,"Anonymous Interview Candidate in London, England",No Offer,Neutral Experience,Difficult Interview,"I interviewed at Google (London, England)",Few relatively simple technical questions. Ver...,1 person found this interview helpful\nHelpful...
4,Google,I signed an NDA cannot disclose those,27-Mar-23,Anonymous Employee,Accepted Offer,Positive Experience,Difficult Interview,I interviewed at Google,There is s a tech screen and then an onsite. Y...,1 person found this interview helpful\nHelpful...


In [47]:
interviews_df_cleaned['Interview Questions'] = interviews_df_cleaned['Interview Questions'].apply(cleaning)

In [49]:
interviews_df_cleaned['Date'] = pd.to_datetime(interviews_df_cleaned['Date'])

  interviews_df_cleaned['Date'] = pd.to_datetime(interviews_df_cleaned['Date'])


In [50]:
interviews_df_cleaned['Offer?'] = interviews_df_cleaned['Offer?'].str.lower()

In [51]:
interviews_df_cleaned['Experience '] = interviews_df_cleaned['Experience '].str.lower()

In [52]:
interviews_df_cleaned['Difficulty of Interview'] = interviews_df_cleaned['Difficulty of Interview'].str.lower()

In [53]:
interviews_df_cleaned['Candidate'] = interviews_df_cleaned['Candidate'].apply(cleaning)

In [56]:
interviews_df_cleaned.head()

Unnamed: 0,Company,Interview Questions,Date,Candidate,Offer?,Experience,Difficulty of Interview,How the candidate applied,Process,Who found review helpful
0,Google,brough tyou,2023-05-30,anonymous interview candidate zürich,declined offer,neutral experience,difficult interview,I interviewed at Google (Zürich),It wa s avery smooth interviw . I really liked...,Be the first to find this interview helpful\nH...
1,Google,would optimize database query would evaluate p...,2023-05-18,anonymous interview candidate,no offer,positive experience,average interview,I interviewed at Google,They asked me about my technical skills and st...,2 people found this interview helpful\nHelpful...
2,Google,predict metrics using regression,2023-05-04,anonymous interview candidate mountain view ca,no offer,positive experience,difficult interview,"I interviewed at Google (Mountain View, CA)",brainstorm some statistics question with the i...,1 person found this interview helpful\nHelpful...
3,Google,would forecast brands sales,2023-05-02,anonymous interview candidate london england,no offer,neutral experience,difficult interview,"I interviewed at Google (London, England)",Few relatively simple technical questions. Ver...,1 person found this interview helpful\nHelpful...
4,Google,signed nda disclose,2023-03-27,anonymous employee,accepted offer,positive experience,difficult interview,I interviewed at Google,There is s a tech screen and then an onsite. Y...,1 person found this interview helpful\nHelpful...


Find rate of offers given to candidates per company

In [57]:
meta_df_cleaned = interviews_df_cleaned[(interviews_df_cleaned['Company'] == 'Meta')]
google_df_cleaned = interviews_df_cleaned[(interviews_df_cleaned['Company'] == 'Google')]
amazon_df_cleaned = interviews_df_cleaned[(interviews_df_cleaned['Company'] == 'Amazon')]
microsoft_df_cleaned = interviews_df_cleaned[(interviews_df_cleaned['Company'] == 'Microsoft')]
apple_df_cleaned = interviews_df_cleaned[(interviews_df_cleaned['Company'] == 'Apple')]

In [74]:
print(meta_df_cleaned['Difficulty of Interview'].value_counts())
print(google_df_cleaned['Difficulty of Interview'].value_counts())
print(amazon_df_cleaned['Difficulty of Interview'].value_counts())
print(microsoft_df_cleaned['Difficulty of Interview'].value_counts())
print(apple_df_cleaned['Difficulty of Interview'].value_counts())

Difficulty of Interview
average interview      105
difficult interview     39
easy interview           6
Name: count, dtype: int64
Difficulty of Interview
average interview      66
difficult interview    37
easy interview          7
Name: count, dtype: int64
Difficulty of Interview
average interview      113
difficult interview     29
easy interview          18
Name: count, dtype: int64
Difficulty of Interview
average interview      97
difficult interview    44
easy interview         29
Name: count, dtype: int64
Difficulty of Interview
average interview      23
difficult interview    14
easy interview          4
Name: count, dtype: int64


In [76]:
company_cleaned_df = [meta_df_cleaned, google_df_cleaned, amazon_df_cleaned, microsoft_df_cleaned, apple_df_cleaned]
name_cleaned_df = ['Meta', 'Google', 'Amazon', 'Microsoft', 'Apple']
for company, name in zip(company_cleaned_df, name_cleaned_df):
    company_offer_percentage = ((company['Offer?'].value_counts()[1] + company['Offer?'].value_counts()[2]) / (company['Offer?'].value_counts()[0] + company['Offer?'].value_counts()[1] + company['Offer?'].value_counts()[2]) ) * 100
    print( f'Only {round(company_offer_percentage, 2)}% of candidates receive job offers for {name}')

Only 21.33% of candidates receive job offers for Meta
Only 13.64% of candidates receive job offers for Google
Only 21.88% of candidates receive job offers for Amazon
Only 35.29% of candidates receive job offers for Microsoft
Only 24.39% of candidates receive job offers for Apple


In [78]:
for company, name in zip(company_cleaned_df, name_cleaned_df):
    company_difficulty = (company['Difficulty of Interview'].value_counts()[1] / (company['Difficulty of Interview'].value_counts()[0] + company['Difficulty of Interview'].value_counts()[1] + company['Difficulty of Interview'].value_counts()[2])) * 100
    print( f'{round(company_difficulty, 2)}% of candidates describe the interview process as difficult for {name}')

26.0% of candidates describe the interview process as difficult for Meta
33.64% of candidates describe the interview process as difficult for Google
18.12% of candidates describe the interview process as difficult for Amazon
25.88% of candidates describe the interview process as difficult for Microsoft
34.15% of candidates describe the interview process as difficult for Apple


In [79]:
for company, name in zip(company_cleaned_df, name_cleaned_df):
    company_difficulty = (company['Difficulty of Interview'].value_counts()[0] / (company['Difficulty of Interview'].value_counts()[0] + company['Difficulty of Interview'].value_counts()[1] + company['Difficulty of Interview'].value_counts()[2])) * 100
    print( f'{round(company_difficulty, 2)}% of candidates describe the interview process as average for {name}')

70.0% of candidates describe the interview process as average for Meta
60.0% of candidates describe the interview process as average for Google
70.62% of candidates describe the interview process as average for Amazon
57.06% of candidates describe the interview process as average for Microsoft
56.1% of candidates describe the interview process as average for Apple


Analysis of interview questions

All companies

In [82]:
interview_questions_df = interviews_df_cleaned[['Company', 'Interview Questions']] 

In [83]:
interview_questions_df

Unnamed: 0,Company,Interview Questions
0,Google,brough tyou
1,Google,would optimize database query would evaluate p...
2,Google,predict metrics using regression
3,Google,would forecast brands sales
4,Google,signed nda disclose
...,...,...
626,Meta,many orders fries mcdonalds sell year
627,Meta,perform sql join ab test case study resume dee...
628,Meta,given list search consecutive n numbers whose ...
629,Meta,whats favorite fb product improve


In [155]:
vectorizer = TfidfVectorizer(ngram_range=(3,3), min_df=0.001, max_df = 0.75)
vectorized_questions = pd.DataFrame(vectorizer.fit_transform(interview_questions_df['Interview Questions']).toarray(), columns = vectorizer.get_feature_names_out())

vectorized_questions.loc['Total'] = vectorized_questions.sum(numeric_only=True, axis=0)

In [156]:
vectorized_questions = vectorized_questions.sort_values(vectorized_questions.last_valid_index(), axis=1, ascending=False)

In [157]:
sorted_word_list = [(col, vectorized_questions[col].iloc[-1]) for col in vectorized_questions.columns]

In [158]:
sorted_word_list

[('machine learning questions', 2.265013675431684),
 ('tell time questions', 2.225727140435356),
 ('would measure success', 2.0771499818669072),
 ('technical questions asked', 2.0),
 ('question overfitting underfitting', 2.0),
 ('experience data science', 1.9168559038175053),
 ('statistics machine learning', 1.736092908904114),
 ('questions data science', 1.633338509046447),
 ('measure success product', 1.4721858178082279),
 ('standard error mean', 1.467252189122276),
 ('prior experience data', 1.4466010737333335),
 ('probability statistics programming', 1.414213562373095),
 ('recommendation system design', 1.414213562373095),
 ('matrix leet code', 1.414213562373095),
 ('spiral within grid', 1.414213562373095),
 ('coding talk modelsdata', 1.414213562373095),
 ('much coding talk', 1.414213562373095),
 ('code spiral within', 1.414213562373095),
 ('tell time lead', 1.414213562373095),
 ('spiral matrix leet', 1.414213562373095),
 ('anomalies find patterns', 1.414213562373095),
 ('basic pro

Individual companies

In [162]:
meta_interview_questions_df = interview_questions_df[(interview_questions_df['Company'] == 'Meta')]
apple_interview_questions_df = interview_questions_df[(interview_questions_df['Company'] == 'Apple')]
google_interview_questions_df = interview_questions_df[(interview_questions_df['Company'] == 'Google')]
microsoft_interview_questions_df = interview_questions_df[(interview_questions_df['Company'] == 'Microsoft')]
amazon_interview_questions_df = interview_questions_df[(interview_questions_df['Company'] == 'Amazon')]

Apple TFIDF Interview Questions

In [214]:
apple_vectorizer = TfidfVectorizer(ngram_range=(3,3), min_df=0.01, max_df = 0.75)
apple_vectorized_questions = pd.DataFrame(apple_vectorizer.fit_transform(apple_interview_questions_df['Interview Questions']).toarray(), columns = apple_vectorizer.get_feature_names_out())
apple_vectorized_questions.loc['Total'] = apple_vectorized_questions.sum(numeric_only=True, axis=0)
apple_vectorized_questions = apple_vectorized_questions.sort_values(apple_vectorized_questions.last_valid_index(), axis=1, ascending=False)
apple_sorted_word_list = [(col, apple_vectorized_questions[col].iloc[-1]) for col in apple_vectorized_questions.columns]
apple_sorted_word_list

[('tell know displays', 1.0),
 ('asked projects resume', 1.0),
 ('tell current responsibilities', 1.0),
 ('strength relates analysis', 1.0),
 ('calculate sample size', 1.0),
 ('question based resume', 1.0),
 ('conduct ab test', 0.7071067811865475),
 ('sql basic questions', 0.7071067811865475),
 ('use evaluate model', 0.7071067811865475),
 ('specially internship experiences', 0.7071067811865475),
 ('resume job posting', 0.7071067811865475),
 ('questions domain knowledge', 0.7071067811865475),
 ('question set cover', 0.7071067811865475),
 ('basic questions etc', 0.7071067811865475),
 ('basic questions domain', 0.7071067811865475),
 ('asked question set', 0.7071067811865475),
 ('background specially internship', 0.7071067811865475),
 ('items resume job', 0.7071067811865475),
 ('metrics use evaluate', 0.7071067811865475),
 ('would conduct ab', 0.7071067811865475),
 ('approach problems deadline', 0.7071067811865475),
 ('problems deadline environment', 0.7071067811865475),
 ('knn nearest nei

Google TFIDF Interview Questions

In [166]:
google_vectorizer = TfidfVectorizer(ngram_range=(3,3), min_df=0.001, max_df = 0.75)
google_vectorized_questions = pd.DataFrame(google_vectorizer.fit_transform(google_interview_questions_df['Interview Questions']).toarray(), columns = google_vectorizer.get_feature_names_out())
google_vectorized_questions.loc['Total'] = google_vectorized_questions.sum(numeric_only=True, axis=0)
google_vectorized_questions = google_vectorized_questions.sort_values(google_vectorized_questions.last_valid_index(), axis=1, ascending=False)
google_sorted_word_list = [(col, google_vectorized_questions[col].iloc[-1]) for col in google_vectorized_questions.columns]
google_sorted_word_list

[('statistics machine learning', 1.7071067811865475),
 ('standard error mean', 1.449686985531412),
 ('machine learning questions', 1.252067208627597),
 ('prefer google apple', 1.0),
 ('window function group', 1.0),
 ('describe pca works', 1.0),
 ('explain sql works', 1.0),
 ('introduce question probality', 1.0),
 ('signed nda disclose', 1.0),
 ('responsible future job', 1.0),
 ('projects worked phd', 1.0),
 ('unfair coin fair', 0.8000214916751375),
 ('assumption error linear', 0.8000214916751375),
 ('make unfair coin', 0.8000214916751375),
 ('error linear regression', 0.8000214916751375),
 ('write code generate', 0.7526188540958284),
 ('hash table python', 0.7071067811865475),
 ('given coin biased', 0.7071067811865475),
 ('forecast brands sales', 0.7071067811865475),
 ('find width confidence', 0.7071067811865475),
 ('questions ab test', 0.7071067811865475),
 ('explain prior project', 0.7071067811865475),
 ('basic coding simulation', 0.7071067811865475),
 ('probability integral transfor

Amazon TFIDF Interview Questions

In [167]:
amazon_vectorizer = TfidfVectorizer(ngram_range=(3,3), min_df=0.001, max_df = 0.75)
amazon_vectorized_questions = pd.DataFrame(amazon_vectorizer.fit_transform(amazon_interview_questions_df['Interview Questions']).toarray(), columns = amazon_vectorizer.get_feature_names_out())
amazon_vectorized_questions.loc['Total'] = amazon_vectorized_questions.sum(numeric_only=True, axis=0)
amazon_vectorized_questions = amazon_vectorized_questions.sort_values(amazon_vectorized_questions.last_valid_index(), axis=1, ascending=False)
amazon_sorted_word_list = [(col, amazon_vectorized_questions[col].iloc[-1]) for col in amazon_vectorized_questions.columns]
amazon_sorted_word_list

[('explain one project', 1.357886305319185),
 ('tell time questions', 1.2322411070622041),
 ('difference bagging boosting', 1.2129655239136254),
 ('sql machine learning', 1.0182599298148625),
 ('everything sun moon', 1.0),
 ('latest invention think', 1.0),
 ('tell answer questions', 1.0),
 ('basic ml questions', 1.0),
 ('tell time failed', 1.0),
 ('standard behavioral questions', 1.0),
 ('quants logical reasoning', 1.0),
 ('na na na', 1.0),
 ('many behavior questions', 1.0),
 ('value chase study', 1.0),
 ('amazon basics engineering', 1.0),
 ('visualize multidimensional data', 1.0),
 ('convolution matrixmatrix multiplication', 1.0),
 ('difference boosting bagging', 1.0),
 ('describe project proud', 1.0),
 ('write sql python', 1.0),
 ('firstround check cv', 1.0),
 ('write code xyz', 1.0),
 ('blah blah blah', 0.857142857142857),
 ('one project delivered', 0.7341908440294345),
 ('one project cv', 0.7341908440294345),
 ('describe pervious experience', 0.7071067811865476),
 ('case study dema

Meta TFIDF Interview Questions

In [230]:
meta_vectorizer = TfidfVectorizer(ngram_range=(2,2), min_df=0.001, max_df = 0.75)
meta_vectorized_questions = pd.DataFrame(meta_vectorizer.fit_transform(meta_interview_questions_df['Interview Questions']).toarray(), columns = meta_vectorizer.get_feature_names_out())
meta_vectorized_questions.loc['Total'] = meta_vectorized_questions.sum(numeric_only=True, axis=0)
meta_vectorized_questions = meta_vectorized_questions.sort_values(meta_vectorized_questions.last_valid_index(), axis=1, ascending=False)
meta_sorted_word_list = [(col, meta_vectorized_questions[col].iloc[-1]) for col in meta_vectorized_questions.columns]
meta_sorted_word_list

[('would measure', 2.4971994749000714),
 ('measure success', 2.0430502577651692),
 ('case study', 1.9100900205807525),
 ('ab testing', 1.8776597451386603),
 ('sql question', 1.8524711067702948),
 ('sql python', 1.5122788943595629),
 ('sql questions', 1.4609428111647118),
 ('asked questions', 1.4247006615168336),
 ('best friends', 1.4040395444209968),
 ('product sense', 1.3738917729995306),
 ('ab test', 1.355154562631638),
 ('retention rate', 1.3528927912958457),
 ('success product', 1.2966906420113546),
 ('machine learning', 1.2358010292559645),
 ('previous experience', 1.2254876110717614),
 ('determine best', 1.2241900740786422),
 ('sql product', 1.1973458852211027),
 ('news feed', 1.1509704440164734),
 ('would estimate', 1.0980377384727014),
 ('tell time', 1.0979293949709037),
 ('career goals', 1.0),
 ('lot experiences', 1.0),
 ('subject studied', 1.0),
 ('please tell', 1.0),
 ('prove assumption', 1.0),
 ('sql case', 0.9808538366521404),
 ('business case', 0.9554282317149649),
 ('int

Microsoft TFIDF Interview Questions

In [244]:
microsoft_vectorizer = TfidfVectorizer(ngram_range=(3,3), min_df=0.001, max_df = 0.75)
microsoft_vectorized_questions = pd.DataFrame(microsoft_vectorizer.fit_transform(microsoft_interview_questions_df['Interview Questions']).toarray(), columns = microsoft_vectorizer.get_feature_names_out())
microsoft_vectorized_questions.loc['Total'] = microsoft_vectorized_questions.sum(numeric_only=True, axis=0)
microsoft_vectorized_questions = microsoft_vectorized_questions.sort_values(microsoft_vectorized_questions.last_valid_index(), axis=1, ascending=False)
microsoft_sorted_word_list = [(col, microsoft_vectorized_questions[col].iloc[-1]) for col in microsoft_vectorized_questions.columns]
microsoft_sorted_word_list

[('question overfitting underfitting', 2.0),
 ('technical questions asked', 2.0),
 ('experience data science', 1.8972878104868651),
 ('questions data science', 1.6138547091337885),
 ('prior experience data', 1.4551079118247503),
 ('tell time lead', 1.4142135623730951),
 ('basic probability statistics', 1.4142135623730951),
 ('probability statistics programming', 1.4142135623730951),
 ('find patterns came', 1.4142135623730951),
 ('system design improvement', 1.4142135623730951),
 ('recommendation system design', 1.4142135623730951),
 ('code spiral within', 1.4142135623730951),
 ('coding talk modelsdata', 1.4142135623730951),
 ('much coding talk', 1.4142135623730951),
 ('spiral within grid', 1.4142135623730951),
 ('matrix leet code', 1.4142135623730951),
 ('spiral matrix leet', 1.4142135623730951),
 ('time lead team', 1.4142135623730951),
 ('anomalies find patterns', 1.4142135623730951),
 ('describe experiences interest', 1.1547005383792515),
 ('compose pipeline obtaining', 1.15470053837

Analysis of interview process for each company

In [159]:
process_df = interviews_df_cleaned[['Company', 'Process']] 

In [160]:
process_df.head()

Unnamed: 0,Company,Process
0,Google,It wa s avery smooth interviw . I really liked...
1,Google,They asked me about my technical skills and st...
2,Google,brainstorm some statistics question with the i...
3,Google,Few relatively simple technical questions. Ver...
4,Google,There is s a tech screen and then an onsite. Y...


In [198]:
# Meta df and clean
meta_process_df = process_df[(process_df['Company'] == 'Meta')]
meta_process_df['Process'] = meta_process_df['Process'].apply(cleaning)
# Apple df and clean
apple_process_df = process_df[(process_df['Company'] == 'Apple')]
apple_process_df['Process'] = apple_process_df['Process'].apply(cleaning)
# Google df and clean
google_process_df = process_df[(process_df['Company'] == 'Google')]
google_process_df['Process'] = google_process_df['Process'].apply(cleaning)
# Microsoft df and clean
microsoft_process_df = process_df[(process_df['Company'] == 'Microsoft')]
microsoft_process_df['Process'] = microsoft_process_df['Process'].apply(cleaning)
# Amazon df and clean
amazon_process_df = process_df[(process_df['Company'] == 'Amazon')]
amazon_process_df = amazon_process_df.dropna().copy()
amazon_process_df['Process'] = amazon_process_df['Process'].apply(cleaning)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_process_df['Process'] = meta_process_df['Process'].apply(cleaning)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  apple_process_df['Process'] = apple_process_df['Process'].apply(cleaning)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  google_process_df['Process'] = google_process_df['Process']

Meta TFIDF Process

In [199]:
metaprocess_vectorizer = TfidfVectorizer(ngram_range=(5,5), min_df=0.001, max_df = 0.75)
meta_vectorized_process = pd.DataFrame(metaprocess_vectorizer.fit_transform(meta_process_df['Process']).toarray(), columns = metaprocess_vectorizer.get_feature_names_out())
meta_vectorized_process.loc['Total'] = meta_vectorized_process.sum(numeric_only=True, axis=0)
meta_vectorized_process = meta_vectorized_process.sort_values(meta_vectorized_process.last_valid_index(), axis=1, ascending=False)
metaprocess_sorted_word_list = [(col, meta_vectorized_process[col].iloc[-1]) for col in meta_vectorized_process.columns]
metaprocess_sorted_word_list

[('job experience wanted know salary', 0.33333333333333337),
 ('interview many question previous job', 0.33333333333333337),
 ('question previous job experience wanted', 0.33333333333333337),
 ('interview process difficult interview many', 0.33333333333333337),
 ('experience wanted know salary expectations', 0.33333333333333337),
 ('process difficult interview many question', 0.33333333333333337),
 ('previous job experience wanted know', 0.33333333333333337),
 ('many question previous job experience', 0.33333333333333337),
 ('difficult interview many question previous', 0.33333333333333337),
 ('metrics please max would use', 0.32075886286137795),
 ('content news feed product signals', 0.32075886286137795),
 ('let say want figure facebook', 0.32075886286137795),
 ('max would use determine user', 0.32075886286137795),
 ('please max would use determine', 0.32075886286137795),
 ('want figure facebook instagram user', 0.32075886286137795),
 ('user best friend prioritize showing', 0.32075886

Apple TFIDF Process

In [200]:
appleprocess_vectorizer = TfidfVectorizer(ngram_range=(5,5), min_df=0.001, max_df = 0.75)
apple_vectorized_process = pd.DataFrame(appleprocess_vectorizer.fit_transform(apple_process_df['Process']).toarray(), columns = appleprocess_vectorizer.get_feature_names_out())
apple_vectorized_process.loc['Total'] = apple_vectorized_process.sum(numeric_only=True, axis=0)
apple_vectorized_process = apple_vectorized_process.sort_values(apple_vectorized_process.last_valid_index(), axis=1, ascending=False)
appleprocess_sorted_word_list = [(col, apple_vectorized_process[col].iloc[-1]) for col in apple_vectorized_process.columns]
appleprocess_sorted_word_list

[('process takes long get feedback', 0.31622776601683794),
 ('interviewer well prepared time short', 0.31622776601683794),
 ('rough process takes long get', 0.31622776601683794),
 ('well prepared time short rough', 0.31622776601683794),
 ('length questions broad interviewer well', 0.31622776601683794),
 ('questions broad interviewer well prepared', 0.31622776601683794),
 ('broad interviewer well prepared time', 0.31622776601683794),
 ('time short rough process takes', 0.31622776601683794),
 ('prepared time short rough process', 0.31622776601683794),
 ('short rough process takes long', 0.31622776601683794),
 ('people interviewed two days crazy', 0.2886751345948129),
 ('interviewed two days crazy variety', 0.2886751345948129),
 ('statistical questions liked smart interviewers', 0.2886751345948129),
 ('screening next stage coding interview', 0.2886751345948129),
 ('interview finally greeted hiring board', 0.2886751345948129),
 ('asked teasers programming questions statistical', 0.28867513

Google TFIDF Process

In [201]:
googleprocess_vectorizer = TfidfVectorizer(ngram_range=(5,5), min_df=0.001, max_df = 0.75)
google_vectorized_process = pd.DataFrame(googleprocess_vectorizer.fit_transform(google_process_df['Process']).toarray(), columns = googleprocess_vectorizer.get_feature_names_out())
google_vectorized_process.loc['Total'] = google_vectorized_process.sum(numeric_only=True, axis=0)
google_vectorized_process = google_vectorized_process.sort_values(google_vectorized_process.last_valid_index(), axis=1, ascending=False)
googleprocess_sorted_word_list = [(col, google_vectorized_process[col].iloc[-1]) for col in google_vectorized_process.columns]
googleprocess_sorted_word_list

[('short interview nice talk product', 0.4588314677411235),
 ('interview nice talk product manager', 0.4588314677411235),
 ('interview although good made end', 0.37796447300922725),
 ('talking great excited code interview', 0.37796447300922725),
 ('recruiter questionnaire days applying online', 0.37796447300922725),
 ('great smooth talking great excited', 0.37796447300922725),
 ('online questionnaire schedule phone interview', 0.37796447300922725),
 ('great excited code interview although', 0.37796447300922725),
 ('code interview although good made', 0.37796447300922725),
 ('excited code interview although good', 0.37796447300922725),
 ('smooth talking great excited code', 0.37796447300922725),
 ('reply recruiter questionnaire days applying', 0.37796447300922725),
 ('got reply recruiter questionnaire days', 0.37796447300922725),
 ('days applying online questionnaire schedule', 0.37796447300922725),
 ('applying online questionnaire schedule phone', 0.37796447300922725),
 ('questionnaire

Microsoft TFIDF Process

In [202]:
microsoftprocess_vectorizer = TfidfVectorizer(ngram_range=(5,5), min_df=0.001, max_df = 0.75)
microsoft_vectorized_process = pd.DataFrame(microsoftprocess_vectorizer.fit_transform(microsoft_process_df['Process']).toarray(), columns = microsoftprocess_vectorizer.get_feature_names_out())
microsoft_vectorized_process.loc['Total'] = microsoft_vectorized_process.sum(numeric_only=True, axis=0)
microsoft_vectorized_process = microsoft_vectorized_process.sort_values(microsoft_vectorized_process.last_valid_index(), axis=1, ascending=False)
microsoftprocess_sorted_word_list = [(col, microsoft_vectorized_process[col].iloc[-1]) for col in microsoft_vectorized_process.columns]
microsoftprocess_sorted_word_list

[('six seven interviews one hr', 0.6324555320336759),
 ('hr rest algodesign questions process', 0.6324555320336759),
 ('algodesign questions process organized well', 0.6324555320336759),
 ('seven interviews one hr rest', 0.6324555320336759),
 ('questions process organized well phase', 0.6324555320336759),
 ('organized well phase time prepare', 0.6324555320336759),
 ('process organized well phase time', 0.6324555320336759),
 ('one hr rest algodesign questions', 0.6324555320336759),
 ('rest algodesign questions process organized', 0.6324555320336759),
 ('interviews one hr rest algodesign', 0.6324555320336759),
 ('applied job ad linkedin recruiter', 0.6030226891555273),
 ('link test test almost hours', 0.6030226891555273),
 ('test almost hours pass test', 0.6030226891555273),
 ('almost hours pass test explain', 0.6030226891555273),
 ('recruiter contacted sent link test', 0.6030226891555273),
 ('contacted sent link test test', 0.6030226891555273),
 ('sent link test test almost', 0.60302268

Amazon TFIDF Process

In [203]:
amazonprocess_vectorizer = TfidfVectorizer(ngram_range=(5,5), min_df=0.001, max_df = 0.75)
amazon_vectorized_process = pd.DataFrame(amazonprocess_vectorizer.fit_transform(amazon_process_df['Process']).toarray(), columns = amazonprocess_vectorizer.get_feature_names_out())
amazon_vectorized_process.loc['Total'] = amazon_vectorized_process.sum(numeric_only=True, axis=0)
amazon_vectorized_process = amazon_vectorized_process.sort_values(amazon_vectorized_process.last_valid_index(), axis=1, ascending=False)
amazonprocess_sorted_word_list = [(col, amazon_vectorized_process[col].iloc[-1]) for col in amazon_vectorized_process.columns]
amazonprocess_sorted_word_list

[('blah blah blah blah blah', 0.7302967433402215),
 ('take home assignment one interview', 0.7071067811865476),
 ('questions full day onsite interviews', 0.3691861432497283),
 ('interview take home assignment one', 0.3535533905932738),
 ('assignment one interview take home', 0.3535533905932738),
 ('one interview take home assignment', 0.3535533905932738),
 ('home assignment one interview take', 0.3535533905932738),
 ('fill code question ask technology', 0.3333333333333333),
 ('person nice sde ask fill', 0.3333333333333333),
 ('question ask technology never used', 0.3333333333333333),
 ('interview hard person nice sde', 0.3333333333333333),
 ('sde ask fill code question', 0.3333333333333333),
 ('hard person nice sde ask', 0.3333333333333333),
 ('ask fill code question ask', 0.3333333333333333),
 ('code question ask technology never', 0.3333333333333333),
 ('nice sde ask fill code', 0.3333333333333333),
 ('interview via campus placements nice', 0.31622776601683794),
 ('via campus placeme