# Préparation du dataset

In [None]:
#Cellule d'importation des packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

from bokeh.plotting import figure, output_notebook, show
output_notebook()

In [None]:
# Importation du dataset
df=pd.read_csv('data-estimation.csv', sep=';')

In [None]:
#Remplacement des valeurs manquantes NaN de CustomerID par une catégorie "Client non identifié"
df['CustomerID'].fillna('Client non identifié', inplace =True)

In [None]:
#Suppression des lignes contenant une valeur manquante NaN dans la variable Description 
df= df.dropna(axis=0,how='any', subset=['Description'])

In [None]:
#cration d'une nouvelle colonne TotalAmount 
df['TotalAmount'] = df['Quantity'] * df['UnitPrice']

In [None]:
#Conversion de la colonne InvoiceDate en to_datetime (car initialement de type object)
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

In [None]:
df['InvoiceYear'] = pd.to_datetime(df.InvoiceDate).dt.year
df['InvoiceMonth'] = pd.to_datetime(df.InvoiceDate).dt.month
df['InvoiceDay'] = pd.to_datetime(df.InvoiceDate).dt.weekday
df['Weekday'] = df.InvoiceDate.dt.day_name()
df['Week']=df.InvoiceDate.dt.week
df['InvoiceTime'] = pd.to_datetime(df.InvoiceDate).dt.time
#création d'une colonne heure car time est trop précis
df['InvoiceHour'] = pd.to_datetime(df.InvoiceDate).dt.hour

  """


In [None]:
# 0) création de la nouvelle colonne InvoiceType avec valeur par défaut 'abcd'
df['InvoiceType']='abcd'

# 1) 
#Catégorisation des dettes dans notre nouvelle colonne InvoiceType
df[df['InvoiceNo'].str.contains("A")]=df[df['InvoiceNo'].str.contains("A")].replace(to_replace=['abcd'],value=['Debt'])
#Catégorisation des annulations de commandes (retours) dans notre nouvelle colonne InvoiceType
df[df['InvoiceNo'].str.contains("C")]=df[df['InvoiceNo'].str.contains("C")].replace(to_replace=['abcd'],value=['Canceled'])
#Catégorisation des autres lignes en ventes 'Sale'
#Pour cela, on extrait les lignes pour lesquels InvoiceNo ne contient pas 'A' ou 'C'
df[-((df['InvoiceNo'].str.contains("A"))&(df['InvoiceNo'].str.contains("C")))]=df[-((df['InvoiceNo'].str.contains("A"))&(df['InvoiceNo'].str.contains("C")))].replace(to_replace=['abcd'],value=['Sale'])

# 2) Modification des catégories 'Canceled' et 'Sale' en fonction du stockCode et ce à quoi ça correspond
# catégorie Discount
df[df['StockCode']=='D']=df[df['StockCode']=='D'].replace(to_replace=['Canceled','Sale'],value=['Discount','Discount'])
# catégorie Amazon fee
df[df['StockCode']=='AMAZONFEE']=df[df['StockCode']=='AMAZONFEE'].replace(to_replace=['Canceled','Sale'],value=['Amazon fee','Amazon fee'])
# catégorie Commission
df[df['StockCode']=='CRUK']=df[df['StockCode']=='CRUK'].replace(to_replace=['Canceled','Sale'],value=['Commission','Commission'])
# catégorie Sample 
df[df['StockCode']=='S']=df[df['StockCode']=='S'].replace(to_replace=['Canceled','Sale'],value=['Sample','Sample'])
# catégorie Manual order
df[df['StockCode']=='M']=df[df['StockCode']=='M'].replace(to_replace=['Sale','Canceled'],value = ['Manual order','Manual order'])
# catégorie Postage
df[(df['StockCode']=='POST')|(df['StockCode']=='DOT')]=df[(df['StockCode']=='DOT')|(df['StockCode']=='POST')].replace(to_replace=['Sale','Canceled'],value=['Postage','Postage'])
# catégorie Carriage
df[df['StockCode']=='C2']=df[df['StockCode']=='C2'].replace(to_replace=['Sale','Canceled'],value=['Carriage','Carriage'])
# catégorie Bank charges
df[df['StockCode']=='BANK CHARGES']=df[df['StockCode']=='BANK CHARGES'].replace(to_replace=['Sale','Canceled'],value=['Bank charges','Bank charges'])



In [None]:
# 3) On veut maintenant catégoriser les lignes pour lesquels on a des description qui semblent correspondre à des problèmes
# on crée d'abord une liste contenant les descriptions 'problèmes'
Description_problems = ['amazon', '?', 'check', 'damages','faulty', 'Dotcom sales', 'amazon sales', 'Found',
       'reverse 21/5/10 adjustment','mouldy, thrown away.', 'found', 'counted', 'Given away', 'Dotcom', 'label mix up',
       'samples/damages', 'thrown away', 'incorrectly made-thrown away.','showroom', 'MIA', 'Adjustment',
       'Dotcom set', 'wrongly sold as sets', 'Amazon sold sets','dotcom sold sets', 'wrongly sold sets', 
       '? sold as sets?','?sold as sets?', 'Thrown away.', 'damages/display','damaged stock', 'broken', 'throw away',
       'wrong barcode (22467)', 'wrongly sold (22719) barcode','wrong barcode', 'barcode problem', '?lost',
       "thrown away-can't sell.", "thrown away-can't sell",'rcvd be air temp fix for dotcom sit', 'damages?',
       're dotcom quick fix.', "Dotcom sold in 6's", 'sold in set?','cracked', 'sold as 22467', 'Damaged',
       'mystery! Only ever imported 1800','MERCHANT CHANDLER CREDIT ERROR, STO', 'POSSIBLE DAMAGES OR LOST?',
       'damaged', 'DAMAGED', 'did  a credit  and did not tick ret','Display', 'Missing', 'adjustment', 'returned', 'wrong code?',
       'wrong code', 'adjust', 'crushed', 'damages/showroom etc','samples', 'damages/credits from ASOS.', 'mailout ', 'mailout',
       'Not rcvd in 10/11/2010 delivery', 'Thrown away-rusty','sold as set/6 by dotcom', 'wet/rusty', 'damages/dotcom?',
       'on cargo order', 'smashed','incorrectly credited C550456 see 47','reverse previous adjustment', 'wet damaged',
       'Water damaged','missing', 'sold as set on dotcom','sold as set on dotcom and amazon', 'water damage',
       'sold as set by dotcom', 'Printing smudges/thrown away','printing smudges/thrown away',
       'to push order througha s stock was ', 'found some more on shelf','Show Samples', 'FOUND', 'mix up with c',
       'mouldy, unsaleable.', 'wrongly marked. 23343 in box', 'came coded as 20713',
       'alan hodge cant mamage this section','dotcom', 'FBA','stock creditted wrongly', 'ebay',
       'incorrectly put back into stock', 'Damages/samples','Sold as 1 on dotcom', 'taig adjust no stock', 
       'code mix up? 84930', '?display?', 'sold as 1', '?missing','crushed ctn', 'Crushed', 'test', 'temp adjustment',
       'taig adjust','allocate stock for dotcom orders ta', '??','add stock to allocate online orders', 
       'for online retail orders','Amazon', 'found box', 'OOPS ! adjustment', 'Found in w/hse',
       'website fixed', 'Dagamed', 'historic computer difference?....se','Lighthouse Trading zero invc incorr',
       'Incorrect stock entry.','incorrect stock entry.', 'michel oops', 'wrongly coded 20713',
       'wrongly coded-23343', 'stock check', 'crushed boxes','WET/MOULDY', "can't find", 'mouldy',
       'Wet pallet-thrown away','Had been put aside.', 'Sale error', 'Amazon Adjustment',
       'wrongly marked 23343', '20713 wrongly marked', 're-adjustment','Breakages', 'Marked as 23343', '20713', 
       'wrongly coded 23343','Found by jackie', 'Damages', 'CHECK', 'Unsaleable, destroyed.',
       'wrongly marked', 'dotcom sales', 'had been put aside','damages wax', 'water damaged', 'Wrongly mrked had 85123a in box',
       'wrongly marked carton 22804', 'missing?', 'wet rusty','amazon adjust', '???lost', 'dotcomstock',
       'John Lewis','sold with wrong barcode', 'dotcom adjust', 'rusty thrown away','rusty throw away', 'check?',
       '?? missing', 'wet pallet','????missing', '???missing', 'AMAZON', 'lost in space', 'wet?',
       'lost??', '???', 'wet', 'wet boxes','????damages????', 'mixed up', 'lost']

In [None]:
# catégorisation 'Order problem' lorsque la colonne Descirption comporte une valeur appartenant à la liste Description_problems
df[df.Description.isin(Description_problems)] = df[df.Description.isin(Description_problems)].replace(to_replace='Sale',value='Order problem')
# 4) catégorisation des lignes pour lesquelles on a un prix == 0 (hors Order problem)
df[df['UnitPrice']==0] = df[df['UnitPrice']==0].replace(to_replace='Sale', value='Gift')

In [None]:
df['TotalAmount'] = df['Quantity'] * df['UnitPrice']

In [None]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalAmount,InvoiceYear,InvoiceMonth,InvoiceDay,Weekday,Week,InvoiceTime,InvoiceHour,InvoiceType
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3,2010,12,2,Wednesday,48,08:26:00,8,Sale
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010,12,2,Wednesday,48,08:26:00,8,Sale
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0,2010,12,2,Wednesday,48,08:26:00,8,Sale
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010,12,2,Wednesday,48,08:26:00,8,Sale
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2010,12,2,Wednesday,48,08:26:00,8,Sale


In [None]:
#Création d'un dataframe comprenant uniquement les ventes (InvoiceType = Sale)
df_sale = df[(df['InvoiceType']=='Sale')]

In [None]:
#Création d'un dataframe comprenant uniquement les annulations de commandes (=retours) (InvoiceType = Canceled)
df_canceled=df[df['InvoiceType']=='Canceled']


# Brouillon test - régression


In [None]:
df.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country', 'TotalAmount', 'InvoiceYear',
       'InvoiceMonth', 'InvoiceDay', 'Weekday', 'Week', 'InvoiceTime',
       'InvoiceHour', 'InvoiceType'],
      dtype='object')

In [None]:
type(df.columns)

pandas.core.indexes.base.Index

In [None]:
df.dtypes

InvoiceNo               object
StockCode               object
Description             object
Quantity                 int64
InvoiceDate     datetime64[ns]
UnitPrice              float64
CustomerID              object
Country                 object
TotalAmount            float64
InvoiceYear              int64
InvoiceMonth             int64
InvoiceDay               int64
Weekday                 object
Week                     int64
InvoiceTime             object
InvoiceHour              int64
InvoiceType             object
dtype: object

In [None]:
df.StockCode.dtype

dtype('O')

In [None]:
df.Description.dtype

dtype('O')

In [None]:
df.InvoiceDate.dtype

dtype('<M8[ns]')

In [None]:
df.Quantity.dtype

dtype('int64')

In [None]:
df.TotalAmount.dtype

dtype('float64')

In [None]:
print(df['TotalAmount'].dtype)

float64


In [None]:
df['TotalAmount'].dtype

dtype('float64')

In [None]:
print(df['TotalAmount'].dtype) == 'float64'

float64
False


In [None]:
#liste des colonnes quali
columns_quali = []
for column in df.columns :
  type = df[column].dtype
  if type == 'object' :
    columns_quali.extend([column])

In [None]:
columns_quali

In [None]:
#liste des colonnes quanti
columns_quanti = []
for column in df.columns :
  type = df[column].dtype
  if type == 'float64' or type == 'int64':
    columns_quanti.extend([column])

columns_quanti

In [None]:
#transformer les va quali en indicatrices
columns_quali2 = ['Weekday','InvoiceType']

for column in columns_quali2 : 
  df = df.join(pd.get_dummies(df[column], prefix=column))
df.head()

In [None]:
#transformation standardisation
from sklearn import model_selection, preprocessing

columns_quanti2 = ['Quantity','UnitPrice','TotalAmount']

for column in columns_quanti2 : 
  scaler = preprocessing.StandardScaler().fit(df[[column]])
  df[column] = pd.DataFrame(scaler.transform(df[[column]]), index= df.index)
df.head()

In [None]:
#supression des autres colonnes 
df=df.drop(columns_quali, axis=1)

In [None]:
#séparation va cible et data
feats = df.drop(['SalesPrice'], axis=1)
target = df.SalesPrice

In [None]:
#ens d'apprentissage et ens de test
X_train, X_test, y_train, y_test = train_test_split(feats, target, test_size=0.2, random_state= 101)

In [None]:
#essai test
# création d'une fonction rmse_cv
def rmse_cv(modele, X_train, y_train):
    return np.sqrt(-cross_val_score(modele,X_train,y_train, scoring = 'neg_mean_squared_error').mean())

rmse(lr,X_train, y_train )

In [None]:
#calcul du rmse pour chaque alpha pour ensuite afficher un graph
alphas = [0.01, 0.05, 0.1, 0.3, 0.8, 1, 5, 10, 15, 30, 50]
liste_rmse =[]

from sklearn.linear_model import Ridge

for alpha in alphas :
  ridge_reg = Ridge(alpha= alpha)
  ridge_reg.fit(X_train, y_train) 
  rmse = rmse_cv(ridge_reg, X_train, y_train )
  liste_rmse.extend([rmse])

liste_rmse

In [None]:
#afficahge graphique
%matplotlib inline
import matplotlib.pyplot as plt

plt.plot(alphas,liste_rmse)
plt.scatter(alphas, liste_rmse, c = 'red')

In [None]:
#modèle de regression ridge performant 
#plus l'erreur quadratique moyenne est proche de 0, plus précises sont les prédictions 
#on prend donc alpha = ...

from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha = 50)
ridge_reg.fit(X_train, y_train) 

# score R2 sur ens train et ens de test
print("score train :", ridge_reg.score(X_train, y_train))
print("score test :", ridge_reg.score(X_test, y_test))

#erreur quadratique moyenne de prédiction pour ridge_pred_train et ridge_pred_test, grâce à la fonction mean_squarred_error
from sklearn.metrics import mean_squared_error
ridge_pred_train = ridge_reg.predict(X_train)
ridge_pred_test = ridge_reg.predict(X_test)

print("mse train:", mean_squared_error(ridge_pred_train, y_train))
print("mse test:", mean_squared_error(ridge_pred_test, y_test))

In [None]:
#regression Lasso
from sklearn.linear_model import LassoCV

lasso_reg = LassoCV(alphas = (10, 1, 0.1, 0.001, 0.0005))

lasso_reg.fit(X_train, y_train)

print("score train:",lasso_reg.score(X_train,y_train))
print("score test :", lasso_reg.score(X_test,y_test))

lasso_pred_train = lasso_reg.predict(X_train)
lasso_pred_test = lasso_reg.predict(X_test)

print("mse train:", mean_squared_error(lasso_pred_train, y_train))
print("mse test:", mean_squared_error(lasso_pred_test, y_test))

In [None]:
#Les coefficients estimés sont récupérables, comme pour toute régression linéaire, dans l'attribut coef_
lasso_coef = lasso_reg.coef_

plt.plot(range(len(data.columns)), lasso_coef)
plt.xticks(range(len(data.columns)), data.columns.values, rotation=70)
plt.show()

In [None]:
coeffs = list(lasso_reg.coef_)
#coeffs.insert(0, lasso_reg.intercept_)
feats = list(data.columns)
#feats.insert(0, 'intercept')

pd.DataFrame({'valeur estimée': coeffs}, index = feats)

# Brouillon test - text mining

In [None]:
#quest 1 
paroles = df.Text

r = re.compile(r"\[(.+?\]")
paroles = r.sub(" ", str(paroles))

#quest 2 
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
print(stop_words)

mots_vides =['?', ....]
stop_words.update(mots_vides)

In [None]:
#quest 3 
form nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer()
mots = tokenizer.tokenize(paroles)

# définir la fonction stop_words_filtering
def stop_words_filetring(mots) : 
    tokens = []
    for mot in mots:
        if mot not in stop_words:
            tokens.append(mot)
    return tokens

stop_words_filetring(mots)   

In [None]:
#quest 4
#Importer les packages nécessaires
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

# Définir le calque du nuage des mots
wc = WordCloud(background_color="white", max_words=1000, stopwords=stop_words, max_font_size=90, random_state=42, collocation = False, mask = mask)

plt.figure(figsize= (10,6)) # Initialisation d'une figure
wc.generate(mots)           # "Calcul" du wordcloud
plt.imshow(wc) # Affichage
plt.show()

#plot_word_cloud(mots, "trump.jpg") ou mask ?



In [None]:
#quest 5
from wordcloud import ImageColorGenerator
help(ImageColorGenerator)
img_color=ImageColorGenerator(mask)
wc.recolor(color_func = img_color)

plt.figure(figsize= (10,6)) # Initialisation d'une figure
wc.generate(mots)           # "Calcul" du wordcloud
plt.imshow(wc, interpolation = 'bilinear') # Affichage

plt.show()

#fig = plt.imshow(wc, interpolation = 'bilinear')  
#fig.axes.get_xaxis().set_visible(False)
#fig.axes.get_yaxis().set_visible(False)


In [None]:
#quest 6 : ens de test et train
# Importer la classe train_test 
from sklearn.model_selection import train_test_split

# Séparer la variable explicative de la variable à prédire
X, y = df.Paroles, df.Sentiment

# Séparer le jeu de données en données d'entraînement et données test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [None]:
#quest 7 : vectorisation 
from sklearn.feature_extraction.text import CountVectorizer

# Initialiser un objet vectorisateur
vectorizer = CountVectorizer()

# Mettre à jour la valeur de X_train et X_test
X_train = vectorizer.fit_transform(X_train).todense()
X_test = vectorizer.transform(X_test).todense()



In [None]:
#quest 8 : entraînement du modele
from sklearn import linear_model

# Création du classifier et construction du modèle sur les données d'entraînement
clf = linear_model.LogisticRegression(C = 1.0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)


In [None]:
#quest 9 : performances du modèle

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

#matrice de confusion
cm = pd.crosstab(y_test, y_pred, rownames=['Classe réelle'], colnames=['Classe prédite'])
cm
clf.score(X_test, y_test)

from sklearn.ensemble import GradientBoostingClassifier

# Créer un classificateur clf et entraîner le modèle sur l'ensemble d'entraînement
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train, y_train)

# Calculer les prédictions 
y_pred = clf.predict(X_test)

from sklearn.metrics import classification_report

# Calcul et affichage de classification_report
print( classification_report(y_test, y_pred) )

# Calcul et affichage de la matrice de confusion
confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Classe réelle'], colnames=['Classe prédite'])
confusion_matrix



# Brouillon test webscrapping

In [None]:
#QCM 
#quest 1 : id 
#quest 2 : H1
#quest 3 : ul
#quest 4 : 4eme reponse
#quest 5 : src
#quest 6 : 3eme réponse
#quest 7 : 2eme réponse

In [3]:
#partie 2 
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas

page = urlopen("https://www.rottentomatoes.com/top/bestofrt/top_100_action__adventure_movies/")
soup = BeautifulSoup(page, 'html.parser')

noms = soup.findAll(name='a', attrs = {'class': 'unstyled articleLink'})
titres_SC=[]
for element in noms:
    titres_SC.append(element.text.strip())

print(titres_SC)

['Opening This Week', 'Top Box Office', 'Coming Soon to Theaters', 'Weekend Earnings', 'Certified Fresh Movies', 'On Dvd & Streaming', 'FandangoNOW', 'Netflix Streaming', 'iTunes', 'Amazon and Amazon Prime', 'Top DVD & Streaming', 'New Releases', 'Coming Soon to DVD', 'Certified Fresh Movies', 'Browse All', 'Top Movies', 'Trailers', 'Forums', 'View All', 'View All', 'Top TV Shows', 'Certified Fresh TV', '24 Frames', 'All-Time Lists', 'Binge Guide', 'Comics on TV', 'Countdown', 'Critics Consensus', 'Five Favorite Films', 'Now Streaming', 'Parental Guidance', 'Red Carpet Roundup', 'Scorecards', 'Sub-Cult', 'Total Recall', 'Video Interviews', 'Weekend Box Office', 'Weekly Ketchup', 'What to Watch', 'The Zeros', 'View All', 'View All', 'View All', 'Black Panther (2018)', 'Avengers: Endgame (2019)', 'Mission: Impossible - Fallout (2018)', 'Mad Max: Fury Road (2015)', 'Spider-Man: Into the Spider-Verse (2018)', 'Wonder Woman (2017)', 'Dunkirk (2017)', 'Coco (2017)', 'Thor: Ragnarok (2017)', 

In [4]:
del titres_SC[0:43]
print(titres_SC)

['Black Panther (2018)', 'Avengers: Endgame (2019)', 'Mission: Impossible - Fallout (2018)', 'Mad Max: Fury Road (2015)', 'Spider-Man: Into the Spider-Verse (2018)', 'Wonder Woman (2017)', 'Dunkirk (2017)', 'Coco (2017)', 'Thor: Ragnarok (2017)', 'Seven Samurai (Shichinin no Samurai) (1956)', 'Logan (2017)', 'Star Wars: The Last Jedi (2017)', 'Star Wars: Episode VII - The Force Awakens (2015)', 'The Adventures of Robin Hood (1938)', 'Spider-Man: Far From Home (2019)', 'Incredibles 2 (2018)', 'King Kong (1933)', 'Zootopia (2016)', 'War for the Planet of the Apes (2017)', 'Baby Driver (2017)', 'Spider-Man: Homecoming (2017)', 'Metropolis (1927)', 'Jaws (1975)', 'Up (2009)', 'Shazam! (2019)', 'The Dark Knight (2008)', 'The Treasure of the Sierra Madre (1948)', 'Blade Runner 2049 (2017)', 'The French Connection (1971)', 'The 39 Steps (1935)', 'Captain America: Civil War (2016)', 'Skyfall (2012)', 'Harry Potter and the Deathly Hallows - Part 2 (2011)', 'The Jungle Book (2016)', 'Apocalypse 

In [5]:
num_critique_SC=[]
for element in soup.findAll('td', attrs={'class':'right hidden-xs'}) :
  num_critique_SC.append(element.text.strip("()")) #il faut retirer les parenthèses

note_SC=[]
for element in soup.findAll('span', attrs={'class':'tMeterIcon tiny'}) :
  note_SC.append(element.text.strip())



In [8]:
rotten_tomatoes = pandas.DataFrame(list(zip(titres_SC,num_critique_SC,note_SC)), columns=["Titre","Num_critique","Note"])

In [9]:
rotten_tomatoes.head()

Unnamed: 0,Titre,Num_critique,Note
0,Black Panther (2018),521,96%
1,Avengers: Endgame (2019),541,94%
2,Mission: Impossible - Fallout (2018),434,97%
3,Mad Max: Fury Road (2015),426,97%
4,Spider-Man: Into the Spider-Verse (2018),390,97%


In [13]:
#partie 3
#pip install google
from googlesearch import search

for url in search('data science', tld='com', lang='en',
                  num= 20, pause= 2.0):
    print (url)

https://home.kpmg/xx/en/home/insights/2019/07/the-emergence-of-data-science-in-pe.html
https://en.wikipedia.org/wiki/Data_science
https://en.wikipedia.org/wiki/Master_in_Data_Science
https://en.wikipedia.org/wiki/Big_data
https://en.wikipedia.org/wiki/Unstructured_data
https://en.wikipedia.org/wiki/Information_science
https://ischoolonline.berkeley.edu/data-science/what-is-data-science/
https://www.coursera.org/specializations/jhu-data-science
https://www.coursera.org/specializations/jhu-data-science#instructors
https://www.coursera.org/specializations/jhu-data-science#enroll
https://www.coursera.org/browse/data-science
https://www.zdnet.com/article/there-are-hundreds-of-thousands-of-jobs-in-ai-those-are-the-skills-you-need-to-get-one/
https://www.utsa.edu/today/2021/05/story/mongeau-david-lead-school-data-science.html
https://wtop.com/business-finance/2021/05/howard-university-teams-with-amazon-for-a-data-science-masters-degree/
https://www.edx.org/course/subject/data-science
https://

In [15]:
from Newspaper import Article
url = 'https://en.wikipedia.org/wiki/Data_science'
article = Article(url)
article.download()
article.parse()
print(article.text)

ModuleNotFoundError: ignored

In [None]:
article.nlp() # on fait avant tout fonctionner le langage naturel
article.authors 
article.publish_date
article.summary
article.keywords