In [1]:
# Load the pre-processed data
import pandas as pd
import numpy as np
import pickle

In [2]:
f=open('preprocessed_data.pkl','rb')
df=pickle.load(f)
f.close()

In [3]:
df.head()

Unnamed: 0,story,role,story_point
0,data user want 12192017 delet process,user,9
1,ui design want redesign resourc page match new...,designer,5
2,ui design want report agenc user test awar con...,designer,8
3,ui design want move round 2 dab fab land page ...,designer,5
4,ui design want move round 2 homepag edit get a...,designer,8


## First we need to convert the *role* Column into numbers

In [4]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [5]:
df['le_role']=le.fit_transform(df['role'])

In [6]:
df[df['role']=='analyst']

Unnamed: 0,story,role,story_point,le_role
276,machin learn expert would like packag ml datas...,analyst,48,3
290,research work data want microsoft power bi int...,analyst,29,3
324,practition want includ addit detail profil pag...,analyst,34,3
327,practition want write articl site small graphi...,analyst,45,3
632,data analyst want download data packag studi w...,analyst,41,3
633,data analyst want updat previous download data...,analyst,37,3
635,data analyst want compar differ version datapa...,analyst,37,3
716,data contributor want refer disast recoveri pl...,analyst,49,3
754,data scientist want abl creat dataset instanc ...,analyst,27,3
755,data scientist want abl upgrad dataset instanc...,analyst,44,3


In [7]:
df['role']=df['le_role']

In [8]:
df=df.drop('le_role',axis=1)

In [9]:
df

Unnamed: 0,story,role,story_point
0,data user want 12192017 delet process,52,9
1,ui design want redesign resourc page match new...,17,5
2,ui design want report agenc user test awar con...,17,8
3,ui design want move round 2 dab fab land page ...,17,5
4,ui design want move round 2 homepag edit get a...,17,8
...,...,...,...
1666,zoonivers admin want know zoonibot give explan,1,17
1667,zoonivers admin want know zoonibot say volunt,1,22
1668,zoonivers admin want group subject similar,1,15
1669,zoonivers admin want recommend differ project ...,1,13


## Vectorization using TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()

In [11]:
ar_tfidf=tfidf.fit_transform(df['story']).toarray()

In [12]:
ar_tfidf.shape

(1671, 2217)

In [13]:
# We need to preserve this transformer to transform the input story for Prediction
f=open('tfidf_trans.trans','wb')
pickle.dump(tfidf,f)
f.close()

In [14]:
column_names=[str(i) for i in range(ar_tfidf.shape[1])]

In [15]:
df_TFIDF=pd.DataFrame(data=ar_tfidf,columns=column_names)

In [16]:
df_TFIDF['role']=df['role']

In [17]:
df_TFIDF['story_point']=df['story_point']

In [18]:
df_TFIDF

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2209,2210,2211,2212,2213,2214,2215,2216,role,story_point
0,0.0,0.0,0.0,0.0,0.682697,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,52,9
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,17,5
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,17,8
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,17,5
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,17,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1666,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.511533,0.355934,1,17
1667,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.506680,0.352557,1,22
1668,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.429315,1,15
1669,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.304823,1,13


## Train Test Splitting 

In [19]:
# We make different sets for training and testing
from sklearn.model_selection import train_test_split


In [20]:
X=df_TFIDF.iloc[:,:-1]
y=df_TFIDF.iloc[:,-1]

In [21]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [22]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((1336, 2218), (335, 2218), (1336,), (335,))

## Training the Model

### 1. Using SVM

In [23]:
# We will try Support vector machine regressor to train the model
from sklearn.svm import SVR
svr=SVR()

In [24]:
svr.fit(X_train,y_train)

SVR()

In [25]:
y_pred=svr.predict(X_test)

In [26]:
y_pred=y_pred.astype('int8')

#### Checking Accuracy

In [27]:
from sklearn.metrics import r2_score


In [28]:
svr_score=r2_score(y_test,y_pred)

In [29]:
svr_score

0.6486637273734805

### 2. Using Random Forest Regressor

In [30]:
# We will try Rf Regressor 
from sklearn.ensemble import RandomForestRegressor

In [31]:
rfr=RandomForestRegressor(n_estimators=500)

In [32]:
rfr.fit(X_train,y_train)

RandomForestRegressor(n_estimators=500)

In [33]:
y_pred=rfr.predict(X_test)

In [34]:
# Check Accuracy
rfr_score=r2_score(y_test,y_pred)
rfr_score

0.7724900543429108

## Our Accuracy :
1. Using SVR 64.8%
2. Using RFR 77.8%

### We can do Following to improve accuracy of the model:
1. Use other Vectorization Technique Like CBOW, Word2Vec, Doc2Vec, BERT etc.
2. Can Do custome feature creation in our dataframe
3. Use different ML Algorithm Gaussian Naive Bayes, Linear Regression, MultiVariate Regression, Lasso Regression etc
4. Can go for Deep Learning Methods if we have enough data

# Preserving the model
### We can preserve both the model for prediction

In [35]:
f=open('svr.model','wb')
pickle.dump(svr,f)
f.close()

In [36]:
k=open('rfr.model','wb')
pickle.dump(rfr,k)
k.close()