Skip to content

Commit

Permalink
added to github
Browse files Browse the repository at this point in the history
  • Loading branch information
Your Name committed Sep 20, 2018
1 parent b87a9cc commit ba13c18
Show file tree
Hide file tree
Showing 13 changed files with 851 additions and 36,744 deletions.
806 changes: 35 additions & 771 deletions .ipynb_checkpoints/demo-checkpoint.ipynb

Large diffs are not rendered by default.

369 changes: 369 additions & 0 deletions .ipynb_checkpoints/training_demo-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,369 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from lib import *\n",
"from features import *"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Importing data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv('./train_data.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = data.sample(frac=0.4).reset_index(drop=True)\n",
"size = data.shape\n",
"size"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Visualisation\n",
"\n",
"Distribution of the data over the four headline categories:\n",
"\n",
"- m: Medical\n",
"- e: Entertainment\n",
"- b: Business\n",
"- t: Tech"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"plt.hist(data.CATEGORY.factorize()[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data samples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for line,i in zip(data['TITLE'],range(data['TITLE'].shape[0])):\n",
" data.loc[i,('TITLE')] = normalise_text(line)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cv_matrix, cv = countVectorizer(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tv_matrix, tv = tfidfTransformer(cv_matrix)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"categories = data.CATEGORY.factorize()[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"training_data, testing_data, training_op, test_op = split_data(tv_matrix,categories)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"svm = SVC()"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"rfc_model = RandomForestClassifier(min_samples_split=4,criterion='entropy',random_state=10)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',\n",
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=4,\n",
" min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n",
" oob_score=False, random_state=10, verbose=0, warm_start=False)"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rfc_model.fit(training_data,training_op)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9906786808113986"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rfc_model.score(training_data,training_op)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8857193584659999"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rfc_model.score(testing_data,test_op)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"#url = \"http://www.bbc.com/\"\n",
"url = \"https://in.yahoo.com/?p=us\"\n",
"headlines = extract_hedlines(url)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"for lines, i in zip(headlines, range(len(headlines))):\n",
" headlines[i] = normalise_text(lines)"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"testing_headlines = pd.DataFrame({\"TITLE\":headlines})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"testing_cv_matrix = cv.transform(testing_headlines['TITLE'])"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"testing_cv_matrix = testing_cv_matrix.toarray()\n"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"transformed_testing_data = tv.transform(testing_cv_matrix) "
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"prediction = rfc_model.predict(transformed_testing_data)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"opposition parties like dmk ncp rjd , jd ( ) , extended support bandh . t\n",
"weekly horoscope 10th sep 16th sep 2018 ) analysis provide insights crucial planetary movements impact life . stay tuned astroyogi ’ expert astrologers horoscope analysis . single , marriage proposals expected chances falling love first sight . take care health avoid smoking drinking alcohol . m\n",
"heartbreaking news chelsi smith first texan win miss universe crown , passed away year-long battle liver cancer age 45. former beauty queen bollywood actor sushmita sen , e\n",
"sanghavis car blue-coloured maruti ignis , found police near sector 11 airoli , navi mumbai , thursday . police officer said blood stains knife found rear seat car . b\n",
"shahid kapoor mira rajput became parents second time baby boy . e\n",
"paro derogatory term used women trafficked sold brides men haryana e\n",
"apache rtr 160 4v carburettor model got covered comes racing heritage tvs motor read detailed review new tvs apache rtr 160 4v e\n",
"aishwarya rai bachchan gets emotional listening national anthem event video aishwarya rai bachchan making rounds social media work front , aishwarya next seen husband abhishek bachchan gulab jamun . check video ! # aishwaryaraibachchan # aishwaryaemotional # nationalanthem e\n"
]
}
],
"source": [
"for i in range(testing_headlines['TITLE'].shape[0]):\n",
" if(len(testing_headlines.TITLE[i])>20):\n",
" print(testing_headlines.TITLE[i],\" \",data.CATEGORY.factorize()[1][prediction[i]])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Binary file modified __pycache__/features.cpython-36.pyc
Binary file not shown.
Binary file modified __pycache__/lib.cpython-36.pyc
Binary file not shown.
Binary file modified cv.pkl
Binary file not shown.
Loading

0 comments on commit ba13c18

Please sign in to comment.